Via Native json

Define a “pretty print” helper, pprint, that prints dict objects as indented JSON and hands anything else (e.g. dataframes) to the notebook's display.

import json


def pprint(x):
    """Pretty-print ``x``.

    Dicts are printed as JSON with a two-space indent. Anything else (e.g. a
    pandas DataFrame or a plain string) is handed to ``display`` when that
    name is available (IPython/Jupyter sessions), and to ``print`` otherwise
    so the helper also works in a plain Python script.
    """
    if isinstance(x, dict):
        print(json.dumps(x, indent=2))
        return
    try:
        show = display  # only defined inside IPython/Jupyter sessions
    except NameError:
        show = print
    show(x)

Load Review Samples

file = "review_categories/All_Beauty.jsonl"
# JSON Lines: each line is one complete JSON record, so reading the first
# line is enough to preview a sample. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
with open(file, 'r', encoding='utf-8') as fp:
    first_line = fp.readline().strip()
if first_line:  # an empty file has nothing to show
    pprint(json.loads(first_line))
{
  "rating": 5.0,
  "title": "Such a lovely scent but not overpowering.",
  "text": "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",
  "images": [],
  "asin": "B00YQ6X8EO",
  "parent_asin": "B00YQ6X8EO",
  "user_id": "AGKHLEW2SOWHNMFQIJGBECAF7INQ",
  "timestamp": 1588687728923,
  "helpful_vote": 0,
  "verified_purchase": true
}

Load Item Metadata

file = "meta_categories/meta_All_Beauty.jsonl"
# JSON Lines: each line is one complete JSON record, so reading the first
# line is enough to preview a sample. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
with open(file, 'r', encoding='utf-8') as fp:
    first_line = fp.readline().strip()
if first_line:  # an empty file has nothing to show
    pprint(json.loads(first_line))
{
  "main_category": "All Beauty",
  "title": "Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)",
  "average_rating": 4.8,
  "rating_number": 10,
  "features": [],
  "description": [],
  "price": null,
  "images": [
    {
      "thumb": "https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg",
      "large": "https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg",
      "variant": "MAIN",
      "hi_res": null
    },
    {
      "thumb": "https://m.media-amazon.com/images/I/41w2yznfuZL._SS40_.jpg",
      "large": "https://m.media-amazon.com/images/I/41w2yznfuZL.jpg",
      "variant": "PT01",
      "hi_res": "https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg"
    }
  ],
  "videos": [],
  "store": "Howard Products",
  "categories": [],
  "details": {
    "Package Dimensions": "7.1 x 5.5 x 3 inches; 2.38 Pounds",
    "UPC": "617390882781"
  },
  "parent_asin": "B01CUPMQZE",
  "bought_together": null
}

Load Pure IDs Files (Before Splitting)

import pandas as pd

# Read the rating-only CSV into a DataFrame and preview its first five rows.
file = "rating_only/All_Beauty.csv"
df = pd.read_csv(file)
pprint(df.head(n=5))
user_id parent_asin rating timestamp
0 AGKHLEW2SOWHNMFQIJGBECAF7INQ B081TJ8YS3 4.0 1588615855070
1 AGKHLEW2SOWHNMFQIJGBECAF7INQ B00YQ6X8EO 5.0 1588687728923
2 AE74DYR3QUGVPZJ3P7RFWBGIX7XQ B097R46CSY 5.0 1589665266052
3 AFQLNQNQYFWQZPJQZS6V3NZU4QBQ B08BZ63GMJ 5.0 1609322563534
4 AFQLNQNQYFWQZPJQZS6V3NZU4QBQ B09JS339BZ 1.0 1643393630220

Load Pure IDs Files (After Splitting)

# Path template: `{}` is filled in with the split name via `file.format(split)`.
file = "timestamp_split/All_Beauty.{}.csv"

Only need (user, item) interactions (with rating and timestamp):

# Select the interaction columns by name rather than by position, so the
# selection is self-describing and stays correct if the CSV's column order
# ever changes. This leaves out any other column in the split files.
interaction_cols = ["user_id", "parent_asin", "rating", "timestamp"]
for split in ['train', 'valid', 'test']:
    df = pd.read_csv(
        file.format(split),
        usecols=interaction_cols,
    )
    pprint(f"split: {split}")
    pprint(df.head())
'split: train'
user_id parent_asin rating timestamp
0 AGKHLEW2SOWHNMFQIJGBECAF7INQ B081TJ8YS3 4.0 1588615855070
1 AGKHLEW2SOWHNMFQIJGBECAF7INQ B00YQ6X8EO 5.0 1588687728923
2 AE74DYR3QUGVPZJ3P7RFWBGIX7XQ B097R46CSY 5.0 1589665266052
3 AFQLNQNQYFWQZPJQZS6V3NZU4QBQ B08BZ63GMJ 5.0 1609322563534
4 AGMJ3EMDVL6OWBJF7CA5RGJLXN5A B00R8DXL44 4.0 1598567408138
'split: valid'
user_id parent_asin rating timestamp
0 AFQLNQNQYFWQZPJQZS6V3NZU4QBQ B09JS339BZ 1.0 1643393630220
1 AHREXOGQPZDA6354MHH4ETSF3MCQ B099DRHW5V 5.0 1631885519443
2 AEYORY2AVPMCPDV57CE337YU5LXA B08BBQ29N5 3.0 1634275259292
3 AFETVW7S5M4LVJ7GTWPCKT7S3YBQ B01M5KNSQN 1.0 1649634131604
4 AGVVUU3QRQBHNASSGI5YQLPYOI2Q B09FF97RHL 1.0 1648824907536
'split: test'
user_id parent_asin rating timestamp
0 AFZUK3MTBIBEDQOPAK3OATUOUKLA B0BFR5WF1R 1.0 1675826333052
1 AHV6QCNBJNSGLATP56JAWJ3C4G2A B0B4JPGX8P 4.0 1660417672640
2 AHV6QCNBJNSGLATP56JAWJ3C4G2A B0B4JP5YD9 5.0 1660417831321
3 AHV6QCNBJNSGLATP56JAWJ3C4G2A B0B8DZ7H5F 4.0 1663163956007
4 AEZ26WGWJ3EOQ4KWSHG77HJAG4EA B0B7RBK4NJ 1.0 1662827578220

Need additional user historical interactions:

# No `usecols` here: every column of each split file is loaded, including
# `history` (which appears to hold the user's earlier parent_asin values,
# space-separated, and NaN when there are none — confirm against the data).
for split in ('train', 'valid', 'test'):
    df = pd.read_csv(file.format(split))
    pprint(f"split: {split}")
    pprint(df.head())
'split: train'
user_id parent_asin rating timestamp history
0 AGKHLEW2SOWHNMFQIJGBECAF7INQ B081TJ8YS3 4.0 1588615855070 NaN
1 AGKHLEW2SOWHNMFQIJGBECAF7INQ B00YQ6X8EO 5.0 1588687728923 B081TJ8YS3
2 AE74DYR3QUGVPZJ3P7RFWBGIX7XQ B097R46CSY 5.0 1589665266052 NaN
3 AFQLNQNQYFWQZPJQZS6V3NZU4QBQ B08BZ63GMJ 5.0 1609322563534 NaN
4 AGMJ3EMDVL6OWBJF7CA5RGJLXN5A B00R8DXL44 4.0 1598567408138 NaN
'split: valid'
user_id parent_asin rating timestamp history
0 AFQLNQNQYFWQZPJQZS6V3NZU4QBQ B09JS339BZ 1.0 1643393630220 B08BZ63GMJ
1 AHREXOGQPZDA6354MHH4ETSF3MCQ B099DRHW5V 5.0 1631885519443 NaN
2 AEYORY2AVPMCPDV57CE337YU5LXA B08BBQ29N5 3.0 1634275259292 NaN
3 AFETVW7S5M4LVJ7GTWPCKT7S3YBQ B01M5KNSQN 1.0 1649634131604 NaN
4 AGVVUU3QRQBHNASSGI5YQLPYOI2Q B09FF97RHL 1.0 1648824907536 NaN
'split: test'
user_id parent_asin rating timestamp history
0 AFZUK3MTBIBEDQOPAK3OATUOUKLA B0BFR5WF1R 1.0 1675826333052 B0020MKBNW B082FLP15V B00946HGLW
1 AHV6QCNBJNSGLATP56JAWJ3C4G2A B0B4JPGX8P 4.0 1660417672640 B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B0...
2 AHV6QCNBJNSGLATP56JAWJ3C4G2A B0B4JP5YD9 5.0 1660417831321 B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B0...
3 AHV6QCNBJNSGLATP56JAWJ3C4G2A B0B8DZ7H5F 4.0 1663163956007 B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B0...
4 AEZ26WGWJ3EOQ4KWSHG77HJAG4EA B0B7RBK4NJ 1.0 1662827578220 NaN