Update app.py
Browse files
app.py
CHANGED
|
@@ -10,9 +10,6 @@ import faiss
|
|
| 10 |
import gradio as gr
|
| 11 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 12 |
from huggingface_hub import hf_hub_download, login
|
| 13 |
-
from pyspark.sql import SparkSession
|
| 14 |
-
from pyspark.sql.functions import col, udf, monotonically_increasing_id, collect_list, concat_ws
|
| 15 |
-
from pyspark.sql.types import StringType
|
| 16 |
from huggingface_hub import HfApi
|
| 17 |
|
| 18 |
# Load token from Hugging Face Secrets
|
|
@@ -34,12 +31,15 @@ def load_reddit_split(subreddit_name):
|
|
| 34 |
# Combine subreddit data
|
| 35 |
combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
df = spark.createDataFrame([{"body": ex["body"]} for ex in islice(combined_dataset, 100000)])
|
| 43 |
|
| 44 |
# Clean text function
|
| 45 |
def clean_body(text):
|
|
@@ -48,17 +48,17 @@ def clean_body(text):
|
|
| 48 |
text = re.sub(r"[^a-zA-Z\s]", "", text)
|
| 49 |
return re.sub(r"\s+", " ", text).strip()
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
|
| 54 |
-
#
|
| 55 |
chunk_size = 5
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
df_chunked
|
| 59 |
|
| 60 |
-
#
|
| 61 |
-
chunked_comments = df_chunked
|
| 62 |
|
| 63 |
# Create subreddit labels
|
| 64 |
combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
|
|
|
|
| 10 |
import gradio as gr
|
| 11 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 12 |
from huggingface_hub import hf_hub_download, login
|
|
|
|
|
|
|
|
|
|
| 13 |
from huggingface_hub import HfApi
|
| 14 |
|
| 15 |
# Load token from Hugging Face Secrets
|
|
|
|
| 31 |
# Combine subreddit data
|
| 32 |
combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
|
| 33 |
|
| 34 |
+
import pandas as pd
|
| 35 |
+
import re
|
| 36 |
+
from itertools import islice
|
| 37 |
+
|
| 38 |
+
# Load a sample of the dataset (e.g., 100,000 records for performance)
|
| 39 |
+
comments = [{"body": ex["body"]} for ex in islice(combined_dataset, 100000)]
|
| 40 |
|
| 41 |
+
# Convert to DataFrame
|
| 42 |
+
df = pd.DataFrame(comments)
|
|
|
|
| 43 |
|
| 44 |
# Clean text function
|
| 45 |
def clean_body(text):
|
|
|
|
| 48 |
text = re.sub(r"[^a-zA-Z\s]", "", text)
|
| 49 |
return re.sub(r"\s+", " ", text).strip()
|
| 50 |
|
| 51 |
+
# Apply cleaning
|
| 52 |
+
df["clean"] = df["body"].apply(clean_body)
|
| 53 |
|
| 54 |
+
# Chunk every 5 rows
|
| 55 |
chunk_size = 5
|
| 56 |
+
df["chunk_id"] = df.index // chunk_size
|
| 57 |
+
df_chunked = df.groupby("chunk_id")["clean"].apply(lambda texts: " ".join(texts)).reset_index()
|
| 58 |
+
df_chunked.rename(columns={"clean": "chunk_text"}, inplace=True)
|
| 59 |
|
| 60 |
+
# Final list for embedding
|
| 61 |
+
chunked_comments = df_chunked["chunk_text"].tolist()
|
| 62 |
|
| 63 |
# Create subreddit labels
|
| 64 |
combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
|