Via Hugging Face Datasets

Suppress unnecessary log output from the Hugging Face `datasets` library by raising its logging verbosity to error level:

import datasets 
datasets.logging.set_verbosity_error()

Load Review Samples

from datasets import load_dataset

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
dataset["full"][0]
{'rating': 5.0,
 'title': 'Such a lovely scent but not overpowering.',
 'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",
 'images': [],
 'asin': 'B00YQ6X8EO',
 'parent_asin': 'B00YQ6X8EO',
 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ',
 'timestamp': 1588687728923,
 'helpful_vote': 0,
 'verified_purchase': True}

Load Item Metadata

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)
dataset[0]
{'main_category': 'All Beauty',
 'title': 'Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)',
 'average_rating': 4.8,
 'rating_number': 10,
 'features': [],
 'description': [],
 'price': 'None',
 'images': {'hi_res': [None,
   'https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg'],
  'large': ['https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg',
   'https://m.media-amazon.com/images/I/41w2yznfuZL.jpg'],
  'thumb': ['https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg',
   'https://m.media-amazon.com/images/I/41w2yznfuZL._SS40_.jpg'],
  'variant': ['MAIN', 'PT01']},
 'videos': {'title': [], 'url': [], 'user_id': []},
 'store': 'Howard Products',
 'categories': [],
 'details': '{"Package Dimensions": "7.1 x 5.5 x 3 inches; 2.38 Pounds", "UPC": "617390882781"}',
 'parent_asin': 'B01CUPMQZE',
 'bought_together': None,
 'subtitle': None,
 'author': None}

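Load Pure-ID Files (Before Splitting)

These files contain only user IDs, item IDs (`parent_asin`), ratings, and timestamps. Note that `rating` and `timestamp` are returned as strings here, not as numbers.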

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "0core_rating_only_All_Beauty", trust_remote_code=True)
dataset['full'][0:5]
{'user_id': ['AGKHLEW2SOWHNMFQIJGBECAF7INQ',
  'AGKHLEW2SOWHNMFQIJGBECAF7INQ',
  'AE74DYR3QUGVPZJ3P7RFWBGIX7XQ',
  'AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
  'AFQLNQNQYFWQZPJQZS6V3NZU4QBQ'],
 'parent_asin': ['B081TJ8YS3',
  'B00YQ6X8EO',
  'B097R46CSY',
  'B08BZ63GMJ',
  'B09JS339BZ'],
 'rating': ['4.0', '5.0', '5.0', '5.0', '1.0'],
 'timestamp': ['1588615855070',
  '1588687728923',
  '1589665266052',
  '1609322563534',
  '1643393630220']}

Load Pure-ID Files (After Splitting)

The split configurations provide `train`, `valid`, and `test` splits.

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "0core_timestamp_All_Beauty", trust_remote_code=True)

If you only need (user, item) interactions:

dataset['train'][:5]
{'user_id': ['AGKHLEW2SOWHNMFQIJGBECAF7INQ',
  'AGKHLEW2SOWHNMFQIJGBECAF7INQ',
  'AE74DYR3QUGVPZJ3P7RFWBGIX7XQ',
  'AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
  'AGMJ3EMDVL6OWBJF7CA5RGJLXN5A'],
 'parent_asin': ['B081TJ8YS3',
  'B00YQ6X8EO',
  'B097R46CSY',
  'B08BZ63GMJ',
  'B00R8DXL44'],
 'rating': ['4.0', '5.0', '5.0', '5.0', '4.0'],
 'timestamp': ['1588615855070',
  '1588687728923',
  '1589665266052',
  '1609322563534',
  '1598567408138']}
dataset['valid'][:5]
{'user_id': ['AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
  'AHREXOGQPZDA6354MHH4ETSF3MCQ',
  'AEYORY2AVPMCPDV57CE337YU5LXA',
  'AFETVW7S5M4LVJ7GTWPCKT7S3YBQ',
  'AGVVUU3QRQBHNASSGI5YQLPYOI2Q'],
 'parent_asin': ['B09JS339BZ',
  'B099DRHW5V',
  'B08BBQ29N5',
  'B01M5KNSQN',
  'B09FF97RHL'],
 'rating': ['1.0', '5.0', '3.0', '1.0', '1.0'],
 'timestamp': ['1643393630220',
  '1631885519443',
  '1634275259292',
  '1649634131604',
  '1648824907536']}
dataset['test'][:5]
{'user_id': ['AFZUK3MTBIBEDQOPAK3OATUOUKLA',
  'AHV6QCNBJNSGLATP56JAWJ3C4G2A',
  'AHV6QCNBJNSGLATP56JAWJ3C4G2A',
  'AHV6QCNBJNSGLATP56JAWJ3C4G2A',
  'AEZ26WGWJ3EOQ4KWSHG77HJAG4EA'],
 'parent_asin': ['B0BFR5WF1R',
  'B0B4JPGX8P',
  'B0B4JP5YD9',
  'B0B8DZ7H5F',
  'B0B7RBK4NJ'],
 'rating': ['1.0', '4.0', '5.0', '4.0', '1.0'],
 'timestamp': ['1675826333052',
  '1660417672640',
  '1660417831321',
  '1663163956007',
  '1662827578220']}

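If you also need each user's historical interactions (the `history` field is a single space-separated string of `parent_asin` values, and is empty when the user has no earlier interactions):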

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "0core_timestamp_w_his_All_Beauty", trust_remote_code=True)
dataset['train'][:5]
{'user_id': ['AGKHLEW2SOWHNMFQIJGBECAF7INQ',
  'AGKHLEW2SOWHNMFQIJGBECAF7INQ',
  'AE74DYR3QUGVPZJ3P7RFWBGIX7XQ',
  'AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
  'AGMJ3EMDVL6OWBJF7CA5RGJLXN5A'],
 'parent_asin': ['B081TJ8YS3',
  'B00YQ6X8EO',
  'B097R46CSY',
  'B08BZ63GMJ',
  'B00R8DXL44'],
 'rating': ['4.0', '5.0', '5.0', '5.0', '4.0'],
 'timestamp': ['1588615855070',
  '1588687728923',
  '1589665266052',
  '1609322563534',
  '1598567408138'],
 'history': ['', 'B081TJ8YS3', '', '', '']}
dataset['valid'][:5]
{'user_id': ['AFQLNQNQYFWQZPJQZS6V3NZU4QBQ',
  'AHREXOGQPZDA6354MHH4ETSF3MCQ',
  'AEYORY2AVPMCPDV57CE337YU5LXA',
  'AFETVW7S5M4LVJ7GTWPCKT7S3YBQ',
  'AGVVUU3QRQBHNASSGI5YQLPYOI2Q'],
 'parent_asin': ['B09JS339BZ',
  'B099DRHW5V',
  'B08BBQ29N5',
  'B01M5KNSQN',
  'B09FF97RHL'],
 'rating': ['1.0', '5.0', '3.0', '1.0', '1.0'],
 'timestamp': ['1643393630220',
  '1631885519443',
  '1634275259292',
  '1649634131604',
  '1648824907536'],
 'history': ['B08BZ63GMJ', '', '', '', '']}
dataset['test'][:5]
{'user_id': ['AFZUK3MTBIBEDQOPAK3OATUOUKLA',
  'AHV6QCNBJNSGLATP56JAWJ3C4G2A',
  'AHV6QCNBJNSGLATP56JAWJ3C4G2A',
  'AHV6QCNBJNSGLATP56JAWJ3C4G2A',
  'AEZ26WGWJ3EOQ4KWSHG77HJAG4EA'],
 'parent_asin': ['B0BFR5WF1R',
  'B0B4JPGX8P',
  'B0B4JP5YD9',
  'B0B8DZ7H5F',
  'B0B7RBK4NJ'],
 'rating': ['1.0', '4.0', '5.0', '4.0', '1.0'],
 'timestamp': ['1675826333052',
  '1660417672640',
  '1660417831321',
  '1663163956007',
  '1662827578220'],
 'history': ['B0020MKBNW B082FLP15V B00946HGLW',
  'B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B00KXFD75M B015ZXMSFQ B00KR4AFJU B01CO73OIQ B00GUTPV4A B01B6V11UY B01KJPFO9W B071R2QPF3 B07GDQPG12 B07KV31WDS B07FZ5HZLM B077YR3333 B07N45YN6C B07J2QZBTP B07NPWK167 B082MTTFZD B07JDD2L3M B07SW7D6ZR B07WNBZQGT B084D86YL8 B082NKQ4ZT B083TLNBJJ B07PRDZ2BH B087D7MVHB B088FBNQXW B085WTCBLG B085NYYLQ8 B08BZ1RHPS B08FRQGYDF B0B2L218H2 B08HXQ3T9K B08KWN77LW B08KYLTK5H B0BTJ6SYKB B07W6H8CGT B08PQ6YXSH B07KQ32Z8C B09KT4RJG6',
  'B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B00KXFD75M B015ZXMSFQ B00KR4AFJU B01CO73OIQ B00GUTPV4A B01B6V11UY B01KJPFO9W B071R2QPF3 B07GDQPG12 B07KV31WDS B07FZ5HZLM B077YR3333 B07N45YN6C B07J2QZBTP B07NPWK167 B082MTTFZD B07JDD2L3M B07SW7D6ZR B07WNBZQGT B084D86YL8 B082NKQ4ZT B083TLNBJJ B07PRDZ2BH B087D7MVHB B088FBNQXW B085WTCBLG B085NYYLQ8 B08BZ1RHPS B08FRQGYDF B0B2L218H2 B08HXQ3T9K B08KWN77LW B08KYLTK5H B0BTJ6SYKB B07W6H8CGT B08PQ6YXSH B07KQ32Z8C B09KT4RJG6 B0B4JPGX8P',
  'B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B00KXFD75M B015ZXMSFQ B00KR4AFJU B01CO73OIQ B00GUTPV4A B01B6V11UY B01KJPFO9W B071R2QPF3 B07GDQPG12 B07KV31WDS B07FZ5HZLM B077YR3333 B07N45YN6C B07J2QZBTP B07NPWK167 B082MTTFZD B07JDD2L3M B07SW7D6ZR B07WNBZQGT B084D86YL8 B082NKQ4ZT B083TLNBJJ B07PRDZ2BH B087D7MVHB B088FBNQXW B085WTCBLG B085NYYLQ8 B08BZ1RHPS B08FRQGYDF B0B2L218H2 B08HXQ3T9K B08KWN77LW B08KYLTK5H B0BTJ6SYKB B07W6H8CGT B08PQ6YXSH B07KQ32Z8C B09KT4RJG6 B0B4JPGX8P B0B4JP5YD9',
  '']}