Spaces:

yoniif
/

final_assignment

Sleeping

App Files Community

yoniif committed on Aug 8, 2025

Commit

607d996

verified ·

1 Parent(s): fadc020

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -32

app.py CHANGED Viewed

@@ -10,40 +10,53 @@ from sentence_transformers import SentenceTransformer, util
 ### STEP 1: Download and unzip the influencer dataset from Hugging Face
-url = "https://huggingface.co/spaces/yoniif/final_assignment/resolve/main/top_influencers.zip"
-zip_path = "top_100_influencers.zip"
-# Download zip file if not already present
-if not os.path.exists(zip_path):
-    print("📥 Downloading influencer dataset...")
-    headers = {"User-Agent": "Mozilla/5.0"}
-    r = requests.get(url, headers=headers)
-    # Confirm file is binary ZIP
-    if r.status_code != 200 or b"PK" not in r.content[:10]:
-        raise ValueError("❌ Invalid ZIP file downloaded. Check URL or access permissions.")
-    with open(zip_path, "wb") as f:
-        f.write(r.content)
-# Unzip the file into a folder
-unzip_dir = "influencer_data"
-if not os.path.exists(unzip_dir):
-    print("📦 Unzipping dataset...")
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        zip_ref.extractall(unzip_dir)
-### STEP 2: Merge all CSVs into one
-print("🔗 Merging influencer files...")
-all_dfs = []
-for file in os.listdir(unzip_dir):
-    if file.endswith(".csv"):
-        df = pd.read_csv(os.path.join(unzip_dir, file))
-        df["Source File"] = file  # Optional: keep track of file origin
-        all_dfs.append(df)
-df = pd.concat(all_dfs, ignore_index=True)
 # Basic cleanup
 df.drop_duplicates(inplace=True)

 ### STEP 1: Download and unzip the influencer dataset from Hugging Face
+# url = "https://huggingface.co/spaces/yoniif/final_assignment/resolve/main/top_influencers.zip"
+# zip_path = "top_100_influencers.zip"
+# # Download zip file if not already present
+# if not os.path.exists(zip_path):
+#     print("📥 Downloading influencer dataset...")
+#     headers = {"User-Agent": "Mozilla/5.0"}
+#     r = requests.get(url, headers=headers)
+#     # Confirm file is binary ZIP
+#     if r.status_code != 200 or b"PK" not in r.content[:10]:
+#         raise ValueError("❌ Invalid ZIP file downloaded. Check URL or access permissions.")
+#     with open(zip_path, "wb") as f:
+#         f.write(r.content)
+# # Unzip the file into a folder
+# unzip_dir = "influencer_data"
+# if not os.path.exists(unzip_dir):
+#     print("📦 Unzipping dataset...")
+#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+#         zip_ref.extractall(unzip_dir)
+# ### STEP 2: Merge all CSVs into one
+# print("🔗 Merging influencer files...")
+# all_dfs = []
+# for file in os.listdir(unzip_dir):
+#     if file.endswith(".csv"):
+#         df = pd.read_csv(os.path.join(unzip_dir, file))
+#         df["Source File"] = file  # Optional: keep track of file origin
+#         all_dfs.append(df)
+# df = pd.concat(all_dfs, ignore_index=True)
+# ✅ Load the combined CSV directly
+df = pd.read_csv("top_100_influencers_combined_sample.csv")
+# Fill NA just in case
+df.fillna("", inplace=True)
+# Combine fields for embeddings
+df["profile_text"] = df["Name"] + " - " + df["Platform"] + " - " + df["Niche"] + " - " + df["Country"]
 # Basic cleanup
 df.drop_duplicates(inplace=True)