yoniif committed on
Commit
607d996
·
verified ·
1 Parent(s): fadc020

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -32
app.py CHANGED
@@ -10,40 +10,53 @@ from sentence_transformers import SentenceTransformer, util
10
 
11
  ### STEP 1: Download and unzip the influencer dataset from Hugging Face
12
 
13
- url = "https://huggingface.co/spaces/yoniif/final_assignment/resolve/main/top_influencers.zip"
14
- zip_path = "top_100_influencers.zip"
15
-
16
- # Download zip file if not already present
17
- if not os.path.exists(zip_path):
18
- print("πŸ“₯ Downloading influencer dataset...")
19
- headers = {"User-Agent": "Mozilla/5.0"}
20
- r = requests.get(url, headers=headers)
21
 
22
- # Confirm file is binary ZIP
23
- if r.status_code != 200 or b"PK" not in r.content[:10]:
24
- raise ValueError("❌ Invalid ZIP file downloaded. Check URL or access permissions.")
25
 
26
- with open(zip_path, "wb") as f:
27
- f.write(r.content)
28
-
29
- # Unzip the file into a folder
30
- unzip_dir = "influencer_data"
31
- if not os.path.exists(unzip_dir):
32
- print("πŸ“¦ Unzipping dataset...")
33
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
34
- zip_ref.extractall(unzip_dir)
35
-
36
- ### STEP 2: Merge all CSVs into one
37
-
38
- print("πŸ”— Merging influencer files...")
39
- all_dfs = []
40
- for file in os.listdir(unzip_dir):
41
- if file.endswith(".csv"):
42
- df = pd.read_csv(os.path.join(unzip_dir, file))
43
- df["Source File"] = file # Optional: keep track of file origin
44
- all_dfs.append(df)
45
-
46
- df = pd.concat(all_dfs, ignore_index=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  # Basic cleanup
49
  df.drop_duplicates(inplace=True)
 
10
 
11
  ### STEP 1: Download and unzip the influencer dataset from Hugging Face
12
 
13
+ # url = "https://huggingface.co/spaces/yoniif/final_assignment/resolve/main/top_influencers.zip"
14
+ # zip_path = "top_100_influencers.zip"
15
+
16
+ # # Download zip file if not already present
17
+ # if not os.path.exists(zip_path):
18
+ # print("πŸ“₯ Downloading influencer dataset...")
19
+ # headers = {"User-Agent": "Mozilla/5.0"}
20
+ # r = requests.get(url, headers=headers)
21
 
22
+ # # Confirm file is binary ZIP
23
+ # if r.status_code != 200 or b"PK" not in r.content[:10]:
24
+ # raise ValueError("❌ Invalid ZIP file downloaded. Check URL or access permissions.")
25
 
26
+ # with open(zip_path, "wb") as f:
27
+ # f.write(r.content)
28
+
29
+ # # Unzip the file into a folder
30
+ # unzip_dir = "influencer_data"
31
+ # if not os.path.exists(unzip_dir):
32
+ # print("πŸ“¦ Unzipping dataset...")
33
+ # with zipfile.ZipFile(zip_path, 'r') as zip_ref:
34
+ # zip_ref.extractall(unzip_dir)
35
+
36
+ # ### STEP 2: Merge all CSVs into one
37
+
38
+ # print("πŸ”— Merging influencer files...")
39
+ # all_dfs = []
40
+ # for file in os.listdir(unzip_dir):
41
+ # if file.endswith(".csv"):
42
+ # df = pd.read_csv(os.path.join(unzip_dir, file))
43
+ # df["Source File"] = file # Optional: keep track of file origin
44
+ # all_dfs.append(df)
45
+
46
+ # df = pd.concat(all_dfs, ignore_index=True)
47
+
48
+
49
+
50
+ # βœ… Load the combined CSV directly
51
+ df = pd.read_csv("top_100_influencers_combined_sample.csv")
52
+
53
+ # Fill NA just in case
54
+ df.fillna("", inplace=True)
55
+
56
+ # Combine fields for embeddings
57
+ df["profile_text"] = df["Name"] + " - " + df["Platform"] + " - " + df["Niche"] + " - " + df["Country"]
58
+
59
+
60
 
61
  # Basic cleanup
62
  df.drop_duplicates(inplace=True)