File size: 4,986 Bytes
581cab8 5725b7b 11db2b7 5725b7b 030d162 5725b7b 39343bf 14ce23c d313634 a78fc34 11db2b7 5725b7b 581cab8 5725b7b 11db2b7 5725b7b 11db2b7 581cab8 5725b7b 581cab8 2e057a8 a045bae 2e057a8 5725b7b 581cab8 2e057a8 5725b7b 2e057a8 5725b7b 2e057a8 5725b7b 2e057a8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 030d162 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 5725b7b 581cab8 030d162 5725b7b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
# -*- coding: utf-8 -*-
import os
import json
import re
from itertools import chain, islice
import numpy as np
from gensim.models import Word2Vec
from tqdm import tqdm
import faiss
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import hf_hub_download, login
from huggingface_hub import HfApi
# Read the Hugging Face access token from the environment variable named
# "RedditSemanticSearch"; os.environ.get yields None when it is not set.
# NOTE(review): nothing in the visible code passes HF_TOKEN to login() or
# hf_hub_download() — confirm whether authentication is still required.
HF_TOKEN = os.environ.get("RedditSemanticSearch")
# Subreddits whose comment files are loaded; load_reddit_split uses each
# name as the stem of a "<name>.jsonl" filename.
target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
# Function to stream JSONL Reddit files from HF Hub
def load_reddit_split(subreddit_name):
    """Stream the comments of one subreddit from the Hugging Face Hub.

    Downloads (or reuses the cached copy of) ``<subreddit_name>.jsonl`` from
    the ``HuggingFaceGECLM/REDDIT_comments`` dataset repository and lazily
    yields one decoded JSON value per non-blank line.

    Args:
        subreddit_name: Subreddit name used verbatim as the stem of the
            JSONL filename.

    Yields:
        The value decoded from each line (callers index it like a dict).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    file_path = hf_hub_download(
        repo_id="HuggingFaceGECLM/REDDIT_comments",
        filename=f"{subreddit_name}.jsonl",
        # The repository is a dataset; without repo_type the Hub client
        # looks for a model repository of that name instead.
        repo_type="dataset",
    )
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                yield json.loads(line)
# Chain the per-subreddit generators into a single lazy stream of records
combined_dataset = chain.from_iterable(
    load_reddit_split(sub) for sub in target_subreddits
)
# Mid-file imports kept in place: the code below relies on ``pd``
import pandas as pd
import re
from itertools import islice
# Materialise only the first 100,000 records to keep the run time manageable
sampled_records = islice(combined_dataset, 100000)
comments = [{"body": record["body"]} for record in sampled_records]
# Single-column DataFrame holding the raw comment bodies
df = pd.DataFrame(comments)
# Clean text function
def clean_body(text):
    """Normalise a raw comment body.

    Lower-cases the text, drops URL-like tokens, removes every character
    that is neither an ASCII letter nor whitespace, and collapses whitespace
    runs into single spaces with no leading or trailing space.
    """
    # Applied in order: strip URLs, strip non-letters, squeeze whitespace.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),
        (r"[^a-zA-Z\s]", ""),
        (r"\s+", " "),
    )
    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()
# Normalise every comment body into a new "clean" column
df["clean"] = df["body"].apply(clean_body)
# Assign consecutive rows to chunks of five via integer division of the index
chunk_size = 5
df["chunk_id"] = df.index // chunk_size
# Join the cleaned comments of each chunk into one space-separated string
clean_by_chunk = df.groupby("chunk_id")["clean"]
df_chunked = clean_by_chunk.apply(lambda parts: " ".join(parts)).reset_index()
df_chunked = df_chunked.rename(columns={"clean": "chunk_text"})
# Plain list of chunk texts used for tokenisation and embedding
chunked_comments = df_chunked["chunk_text"].tolist()
# Create subreddit labels — one per chunk, aligned with chunked_comments.
# Chunk i is built from comments i*chunk_size .. i*chunk_size + chunk_size - 1,
# so its label is read from the first comment of that range. Taking the labels
# of the first len(chunked_comments) comments instead would pair chunk i with
# the subreddit of comment i and mislabel most of the index.
# A chunk that straddles two subreddits is labelled with the first one.
combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
subreddit_labels = [
    example["subreddit_name_prefixed"]
    for example in islice(
        combined_dataset, 0, len(chunked_comments) * chunk_size, chunk_size
    )
]
# Tokenize
def clean_text(text):
    """Normalise text before tokenisation.

    Lower-cases the input, removes URL-like tokens, strips everything that
    is not an ASCII letter or whitespace, and squeezes whitespace runs down
    to single spaces, trimming both ends.
    """
    url_free = re.sub(
        r"http\S+|www\S+|https\S+", "", text.lower(), flags=re.MULTILINE
    )
    letters_only = re.sub(r"[^a-zA-Z\s]", "", url_free)
    return re.sub(r"\s+", " ", letters_only).strip()
# Clean each chunk and split it on whitespace; tqdm shows progress
tokenized_chunks = [clean_text(chunk).split() for chunk in tqdm(chunked_comments)]
# Train Word2Vec on the token lists and persist the model to disk
w2v_params = dict(vector_size=100, window=5, min_count=2, workers=4, sg=1)
model = Word2Vec(sentences=tokenized_chunks, **w2v_params)
model.save("reddit_word2vec.model")
# Embedding function
def get_chunk_embedding(chunk_tokens, model):
    """Average the word vectors of the tokens known to *model*.

    Args:
        chunk_tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors found in ``model.wv``, or a
        zero vector of length ``model.vector_size`` when none is found.
    """
    keyed_vectors = model.wv
    known = []
    for word in chunk_tokens:
        if word in keyed_vectors:
            known.append(keyed_vectors[word])
    if known:
        return np.mean(known, axis=0)
    # No token is in the vocabulary: fall back to the zero vector.
    return np.zeros(model.vector_size)
# Embed every tokenised chunk and stack the vectors into a float32 matrix
chunk_embeddings = [
    get_chunk_embedding(chunk_tokens, model) for chunk_tokens in tokenized_chunks
]
embedding_matrix = np.asarray(chunk_embeddings, dtype="float32")
# Build the flat L2 FAISS index over the chunk vectors and write it to disk
index = faiss.IndexFlatL2(model.vector_size)
index.add(embedding_matrix)
faiss.write_index(index, "reddit_faiss.index")
# Reload the saved model and index for the search API
model = Word2Vec.load("reddit_word2vec.model")
index = faiss.read_index("reddit_faiss.index")
# Lookup tables: position -> subreddit label, and the sorted distinct labels
subreddit_map = dict(enumerate(subreddit_labels))
unique_subreddits = sorted(set(subreddit_labels))
# Chunk texts rebuilt from their tokens
original_chunks = [" ".join(chunk_tokens) for chunk_tokens in tokenized_chunks]
# Search function
def embed_text(text):
    """Embed a free-text query as the mean of its known word vectors.

    The query goes through ``clean_text`` — the same normalisation applied
    to the indexed chunks before tokenisation — so punctuation attached to a
    word (e.g. ``"python?"``) no longer prevents a vocabulary match.

    Args:
        text: Query string; ``None`` is treated as an empty query.

    Returns:
        numpy.ndarray: Vector of length ``model.vector_size``; all zeros
        when no query token is in the model vocabulary.
    """
    tokens = clean_text(text or "").split()
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
def search_reddit(query, selected_subreddit, top_k=5):
    """Return the indexed chunks closest to *query* as display text.

    Args:
        query: Free-text search string.
        selected_subreddit: Label compared against the values of
            ``subreddit_map``. A falsy value (nothing selected) disables
            the subreddit filter.
        top_k: Maximum number of chunks to return.

    Returns:
        str: Matching chunks, each prefixed with "🔸 " and separated by a
        blank line, or a warning message when nothing matches.
    """
    no_results = "⚠️ No relevant results found."
    query_vec = embed_text(query).astype("float32").reshape(1, -1)
    # An all-zero vector means no query word is in the vocabulary; its
    # nearest neighbours would be arbitrary, so report "no results" instead.
    if not query_vec.any():
        return no_results
    # Over-fetch (2 * top_k) because the subreddit filter discards hits.
    _, neighbour_ids = index.search(query_vec, top_k * 2)
    results = []
    for idx in neighbour_ids[0]:
        idx = int(idx)
        # FAISS pads the id array with -1 when it finds fewer than k
        # neighbours; a bare upper-bound check would let -1 through and
        # fail on the subreddit_map lookup.
        if not 0 <= idx < len(chunked_comments):
            continue
        if selected_subreddit and subreddit_map.get(idx) != selected_subreddit:
            continue
        results.append(f"🔸 {chunked_comments[idx]}")
        if len(results) >= top_k:
            break
    return "\n\n".join(results) if results else no_results
# Gradio UI
# NOTE(review): the source lost its indentation; the nesting below is a
# reconstruction — confirm which components belong inside the Row.
with gr.Blocks(theme=gr.themes.Base(primary_hue="orange")) as demo:
    # Header: logo fetched from a remote URL plus a title/disclaimer
    gr.Image(value="https://1000logos.net/wp-content/uploads/2017/05/Reddit-Logo.png", show_label=False, height=100)
    gr.Markdown("## Reddit Semantic Search (Powered by Word2Vec + FAISS)\n_Disclaimer: Prototype, not affiliated with Reddit Inc._")
    # Inputs: the query text and the subreddit to filter on
    with gr.Row():
        query = gr.Textbox(label="Enter Reddit-style query")
        subreddit_dropdown = gr.Dropdown(choices=unique_subreddits, label="Choose Subreddit")
    output = gr.Textbox(label="Matching Comments", lines=10)
    search_btn = gr.Button("🔍 Search")
    # The click passes (query, subreddit) to search_reddit — top_k keeps its
    # default — and writes the returned string into the output box
    search_btn.click(fn=search_reddit, inputs=[query, subreddit_dropdown], outputs=output)
# Start the app; share=True is passed to launch()
demo.launch(share=True)
|