Spaces:

GenAIDevTOProd
/

Reddit-SemanticSearch-Prototype

Sleeping

GenAIDevTOProd committed on Aug 6

Commit

b661309

verified ·

1 Parent(s): cc1c9e4

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,17 +19,16 @@ HF_TOKEN = os.environ.get("RedditSemanticSearch")
 target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
 # Function to stream JSONL Reddit files from HF Hub
-def load_reddit_split(subreddit_name):
-    file_path = hf_hub_download(
-        repo_id="HuggingFaceGECLM/REDDIT_comments",
-        filename=f"{subreddit_name}.jsonl"
-    )
-    with open(file_path, "r") as f:
-        for line in f:
-            yield json.loads(line)
-# Combine subreddit data
-combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
 import pandas as pd
 import re

 target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
 # Function to stream JSONL Reddit files from HF Hub
+from datasets import load_dataset
+# Load full Reddit dataset (assumes it's pre-split by subreddit or has a field)
+dataset = load_dataset("HuggingFaceGECLM/REDDIT_comments", split="train")
+# Filter only relevant subreddits
+dataset = dataset.filter(lambda x: x["subreddit"] in target_subreddits)
+# Take a sample (to limit memory for now)
+comments = [{"body": ex["body"]} for ex in dataset.select(range(100000))]
 import pandas as pd
 import re