## Via Native json
Define a “pretty print” helper, `pprint`, that prints dict objects as indented JSON
and hands everything else (e.g. DataFrames) to the notebook's rich display.
import json
def pprint(x):
    """Pretty-print *x*.

    Dicts are printed as JSON indented by two spaces. Any other object
    (e.g. a DataFrame) is handed to ``display``, which IPython/Jupyter
    injects as a builtin. When that name is not defined (plain Python
    script), the object is printed with ``print`` instead of raising
    ``NameError``.
    """
    if isinstance(x, dict):
        print(json.dumps(x, indent=2))
        return
    try:
        show = display  # noqa: F821 - provided by IPython/Jupyter only
    except NameError:
        show = print
    show(x)
### Load Review Samples
# Peek at the first review record. Each line of the .jsonl file is one
# JSON object, so the file can be streamed line by line.
file = "review_categories/All_Beauty.jsonl"
# JSON text is UTF-8; pass the encoding explicitly so decoding does not
# depend on the platform's locale default.
with open(file, 'r', encoding="utf-8") as fp:
    for line in fp:
        pprint(json.loads(line.strip()))
        break  # only the first record is shown; drop this to scan all lines
{
"rating": 5.0,
"title": "Such a lovely scent but not overpowering.",
"text": "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",
"images": [],
"asin": "B00YQ6X8EO",
"parent_asin": "B00YQ6X8EO",
"user_id": "AGKHLEW2SOWHNMFQIJGBECAF7INQ",
"timestamp": 1588687728923,
"helpful_vote": 0,
"verified_purchase": true
}
### Load Item Metadata
# Peek at the first item-metadata record. Each line of the .jsonl file is
# one JSON object, so the file can be streamed line by line.
file = "meta_categories/meta_All_Beauty.jsonl"
# JSON text is UTF-8; pass the encoding explicitly so decoding does not
# depend on the platform's locale default.
with open(file, 'r', encoding="utf-8") as fp:
    for line in fp:
        pprint(json.loads(line.strip()))
        break  # only the first record is shown; drop this to scan all lines
{
"main_category": "All Beauty",
"title": "Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)",
"average_rating": 4.8,
"rating_number": 10,
"features": [],
"description": [],
"price": null,
"images": [
{
"thumb": "https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg",
"large": "https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg",
"variant": "MAIN",
"hi_res": null
},
{
"thumb": "https://m.media-amazon.com/images/I/41w2yznfuZL._SS40_.jpg",
"large": "https://m.media-amazon.com/images/I/41w2yznfuZL.jpg",
"variant": "PT01",
"hi_res": "https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg"
}
],
"videos": [],
"store": "Howard Products",
"categories": [],
"details": {
"Package Dimensions": "7.1 x 5.5 x 3 inches; 2.38 Pounds",
"UPC": "617390882781"
},
"parent_asin": "B01CUPMQZE",
"bought_together": null
}
### Load Pure IDs Files (Before Splitting)
import pandas as pd

# Read the un-split rating file and preview its first five rows.
file = "rating_only/All_Beauty.csv"
df = pd.read_csv(file)
pprint(df.head(n=5))
| | user_id | parent_asin | rating | timestamp |
---|---|---|---|---|
0 | AGKHLEW2SOWHNMFQIJGBECAF7INQ | B081TJ8YS3 | 4.0 | 1588615855070 |
1 | AGKHLEW2SOWHNMFQIJGBECAF7INQ | B00YQ6X8EO | 5.0 | 1588687728923 |
2 | AE74DYR3QUGVPZJ3P7RFWBGIX7XQ | B097R46CSY | 5.0 | 1589665266052 |
3 | AFQLNQNQYFWQZPJQZS6V3NZU4QBQ | B08BZ63GMJ | 5.0 | 1609322563534 |
4 | AFQLNQNQYFWQZPJQZS6V3NZU4QBQ | B09JS339BZ | 1.0 | 1643393630220 |
### Load Pure IDs Files (After Splitting)
# Path template: the `{}` placeholder is filled with the split name via
# `file.format(split)` in the loops that follow.
file = "timestamp_split/All_Beauty.{}.csv"
#### Only need (user, item) interactions:
# Preview each split, loading only the first four columns (by position).
for split in ('train', 'valid', 'test'):
    df = pd.read_csv(file.format(split), usecols=[0, 1, 2, 3])
    pprint(f"split: {split}")
    pprint(df.head(n=5))
'split: train'
| | user_id | parent_asin | rating | timestamp |
---|---|---|---|---|
0 | AGKHLEW2SOWHNMFQIJGBECAF7INQ | B081TJ8YS3 | 4.0 | 1588615855070 |
1 | AGKHLEW2SOWHNMFQIJGBECAF7INQ | B00YQ6X8EO | 5.0 | 1588687728923 |
2 | AE74DYR3QUGVPZJ3P7RFWBGIX7XQ | B097R46CSY | 5.0 | 1589665266052 |
3 | AFQLNQNQYFWQZPJQZS6V3NZU4QBQ | B08BZ63GMJ | 5.0 | 1609322563534 |
4 | AGMJ3EMDVL6OWBJF7CA5RGJLXN5A | B00R8DXL44 | 4.0 | 1598567408138 |
'split: valid'
| | user_id | parent_asin | rating | timestamp |
---|---|---|---|---|
0 | AFQLNQNQYFWQZPJQZS6V3NZU4QBQ | B09JS339BZ | 1.0 | 1643393630220 |
1 | AHREXOGQPZDA6354MHH4ETSF3MCQ | B099DRHW5V | 5.0 | 1631885519443 |
2 | AEYORY2AVPMCPDV57CE337YU5LXA | B08BBQ29N5 | 3.0 | 1634275259292 |
3 | AFETVW7S5M4LVJ7GTWPCKT7S3YBQ | B01M5KNSQN | 1.0 | 1649634131604 |
4 | AGVVUU3QRQBHNASSGI5YQLPYOI2Q | B09FF97RHL | 1.0 | 1648824907536 |
'split: test'
| | user_id | parent_asin | rating | timestamp |
---|---|---|---|---|
0 | AFZUK3MTBIBEDQOPAK3OATUOUKLA | B0BFR5WF1R | 1.0 | 1675826333052 |
1 | AHV6QCNBJNSGLATP56JAWJ3C4G2A | B0B4JPGX8P | 4.0 | 1660417672640 |
2 | AHV6QCNBJNSGLATP56JAWJ3C4G2A | B0B4JP5YD9 | 5.0 | 1660417831321 |
3 | AHV6QCNBJNSGLATP56JAWJ3C4G2A | B0B8DZ7H5F | 4.0 | 1663163956007 |
4 | AEZ26WGWJ3EOQ4KWSHG77HJAG4EA | B0B7RBK4NJ | 1.0 | 1662827578220 |
#### Need additional user historical interactions:
# Preview each split with every column in the file loaded.
for split in ('train', 'valid', 'test'):
    df = pd.read_csv(file.format(split))
    pprint(f"split: {split}")
    pprint(df.head(n=5))
'split: train'
| | user_id | parent_asin | rating | timestamp | history |
---|---|---|---|---|---|
0 | AGKHLEW2SOWHNMFQIJGBECAF7INQ | B081TJ8YS3 | 4.0 | 1588615855070 | NaN |
1 | AGKHLEW2SOWHNMFQIJGBECAF7INQ | B00YQ6X8EO | 5.0 | 1588687728923 | B081TJ8YS3 |
2 | AE74DYR3QUGVPZJ3P7RFWBGIX7XQ | B097R46CSY | 5.0 | 1589665266052 | NaN |
3 | AFQLNQNQYFWQZPJQZS6V3NZU4QBQ | B08BZ63GMJ | 5.0 | 1609322563534 | NaN |
4 | AGMJ3EMDVL6OWBJF7CA5RGJLXN5A | B00R8DXL44 | 4.0 | 1598567408138 | NaN |
'split: valid'
| | user_id | parent_asin | rating | timestamp | history |
---|---|---|---|---|---|
0 | AFQLNQNQYFWQZPJQZS6V3NZU4QBQ | B09JS339BZ | 1.0 | 1643393630220 | B08BZ63GMJ |
1 | AHREXOGQPZDA6354MHH4ETSF3MCQ | B099DRHW5V | 5.0 | 1631885519443 | NaN |
2 | AEYORY2AVPMCPDV57CE337YU5LXA | B08BBQ29N5 | 3.0 | 1634275259292 | NaN |
3 | AFETVW7S5M4LVJ7GTWPCKT7S3YBQ | B01M5KNSQN | 1.0 | 1649634131604 | NaN |
4 | AGVVUU3QRQBHNASSGI5YQLPYOI2Q | B09FF97RHL | 1.0 | 1648824907536 | NaN |
'split: test'
| | user_id | parent_asin | rating | timestamp | history |
---|---|---|---|---|---|
0 | AFZUK3MTBIBEDQOPAK3OATUOUKLA | B0BFR5WF1R | 1.0 | 1675826333052 | B0020MKBNW B082FLP15V B00946HGLW |
1 | AHV6QCNBJNSGLATP56JAWJ3C4G2A | B0B4JPGX8P | 4.0 | 1660417672640 | B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B0... |
2 | AHV6QCNBJNSGLATP56JAWJ3C4G2A | B0B4JP5YD9 | 5.0 | 1660417831321 | B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B0... |
3 | AHV6QCNBJNSGLATP56JAWJ3C4G2A | B0B8DZ7H5F | 4.0 | 1663163956007 | B00N6WHTRG B00NNKWDI6 B00MDKICPK B010B0S67C B0... |
4 | AEZ26WGWJ3EOQ4KWSHG77HJAG4EA | B0B7RBK4NJ | 1.0 | 1662827578220 | NaN |