Spaces:

GenAIDevTOProd
/

Reddit-SemanticSearch-Prototype

Running

File size: 4,986 Bytes

581cab8
5725b7b
11db2b7
5725b7b
 
030d162
 
 
 
 
5725b7b
 
39343bf
14ce23c
d313634
a78fc34
11db2b7
5725b7b
581cab8
 
5725b7b
11db2b7
5725b7b
 
 
 
 
11db2b7
 
581cab8
5725b7b
 
581cab8
2e057a8
 
 
 
 
 
a045bae
2e057a8
 
5725b7b
 
 
581cab8
 
 
 
 
2e057a8
 
5725b7b
2e057a8
5725b7b
2e057a8
 
 
5725b7b
2e057a8
 
5725b7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581cab8
5725b7b
 
 
 
 
581cab8
5725b7b
581cab8
 
 
5725b7b
 
 
 
 
 
 
 
 
581cab8
5725b7b
581cab8
5725b7b
581cab8
 
5725b7b
581cab8
 
030d162
 
5725b7b
581cab8
5725b7b
581cab8
5725b7b
 
 
581cab8
 
5725b7b
 
581cab8
 
5725b7b
581cab8
 
 
5725b7b
581cab8
5725b7b
 
 
 
581cab8
 
5725b7b
 
 
 
581cab8
 
 
 
030d162
5725b7b

# -*- coding: utf-8 -*-
import os
import json
import re
from itertools import chain, islice
import numpy as np
from gensim.models import Word2Vec
from tqdm import tqdm
import faiss
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
from huggingface_hub import hf_hub_download, login
from huggingface_hub import HfApi

# Access token supplied through the "RedditSemanticSearch" secret, which is
# read from the environment; None when that variable is not set.
HF_TOKEN = os.getenv("RedditSemanticSearch")

# Subreddits whose comment files are fetched, in this order
target_subreddits = [
    "askscience",
    "gaming",
    "technology",
    "todayilearned",
    "programming",
]

# Function to stream JSONL Reddit files from HF Hub
def load_reddit_split(subreddit_name):
    """Yield the comments of one subreddit, one parsed JSON object at a time.

    Fetches ``<subreddit_name>.jsonl`` from the
    ``HuggingFaceGECLM/REDDIT_comments`` repo via ``hf_hub_download`` and
    parses it line by line. Because this is a generator, nothing is fetched
    or read until the first record is requested.

    Args:
        subreddit_name: Base name of the JSONL file to fetch (no extension).

    Yields:
        The object decoded from each non-blank line of the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # NOTE(review): no repo_type is passed, so hf_hub_download uses its
    # default repo type. If this repo is a dataset repo, repo_type="dataset"
    # is presumably required -- confirm against the Hub.
    file_path = hf_hub_download(
        repo_id="HuggingFaceGECLM/REDDIT_comments",
        filename=f"{subreddit_name}.jsonl",
    )
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            yield json.loads(line)

# Lazily concatenate the per-subreddit record streams, in target_subreddits order
combined_dataset = chain(*map(load_reddit_split, target_subreddits))

import pandas as pd
import re
from itertools import islice

# Keep only the comment text of the first 100,000 streamed records
# (a bounded sample, for performance)
comments = [
    {"body": record["body"]}
    for record in islice(combined_dataset, 100000)
]

# Table with one row per sampled comment
df = pd.DataFrame(data=comments)

# Clean text function
def clean_body(text):
    """Normalise a raw comment into lowercase, letters-only text.

    Steps, in order: lowercase the text, drop URL-like tokens (runs of
    non-whitespace starting with ``http`` or ``www``), delete every
    character that is neither an ASCII letter nor whitespace, collapse each
    whitespace run to a single space, and trim both ends.

    Args:
        text: Raw comment body as a string.

    Returns:
        The cleaned string; empty when nothing survives the filtering.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", without_urls)
    single_spaced = re.sub(r"\s+", " ", letters_only)
    return single_spaced.strip()

# Normalise every comment body
df["clean"] = df["body"].apply(clean_body)

# Group consecutive rows into chunks of five via integer division of the index
chunk_size = 5
df["chunk_id"] = df.index // chunk_size

# One row per chunk: that chunk's cleaned comments joined with single spaces
df_chunked = (
    df.groupby("chunk_id")["clean"]
    .apply(" ".join)
    .reset_index()
    .rename(columns={"clean": "chunk_text"})
)

# Plain list of chunk strings, used as the input for embedding
chunked_comments = df_chunked["chunk_text"].tolist()

# Create subreddit labels: one per chunk, aligned with chunked_comments.
# Chunk i is built from sampled comments i * chunk_size .. (i + 1) * chunk_size - 1,
# so its label is taken from the first comment of that span (a chunk that
# straddles two subreddits is labelled after its first comment). Labelling
# chunk i with comment i's subreddit would drift out of step with the chunks
# as soon as the stream moves on to a second subreddit.
# The stream is rebuilt because the first pass over it kept only the bodies.
combined_dataset = chain(*(load_reddit_split(sub) for sub in target_subreddits))
subreddit_labels = [
    example["subreddit_name_prefixed"]
    for example in islice(combined_dataset, 0, len(df), chunk_size)
]

# Tokenize
def clean_text(text):
    """Return *text* lowercased, URL-free, and reduced to letters and single spaces.

    URL-like tokens (starting with ``http`` or ``www``) are removed first,
    then every character that is not an ASCII letter or whitespace, and
    finally whitespace runs are squeezed to one space and the ends trimmed.

    Args:
        text: String to normalise.

    Returns:
        The normalised string, possibly empty.
    """
    url_free = re.sub(r"http\S+|www\S+|https\S+", "", text.lower(), flags=re.MULTILINE)
    alphabetic = re.sub(r"[^a-zA-Z\s]", "", url_free)
    return re.sub(r"\s+", " ", alphabetic).strip()

# One token list per chunk: normalise the text, then split on whitespace
tokenized_chunks = [clean_text(chunk).split() for chunk in tqdm(chunked_comments)]

# Train Word2Vec on the token lists (sg=1 selects skip-gram) and persist it
model = Word2Vec(
    sentences=tokenized_chunks,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
)
model.save("reddit_word2vec.model")

# Embedding function
def get_chunk_embedding(chunk_tokens, model):
    """Average the word vectors of the tokens that *model* knows.

    Args:
        chunk_tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of in-vocabulary tokens, or a
        zero vector of length ``model.vector_size`` when no token is known.
    """
    known_vectors = [model.wv[tok] for tok in chunk_tokens if tok in model.wv]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # No token is in the vocabulary: fall back to the zero vector
    return np.zeros(model.vector_size)

# Embed every chunk, then stack the vectors into one float32 matrix
chunk_embeddings = [
    get_chunk_embedding(chunk_tokens, model) for chunk_tokens in tokenized_chunks
]
embedding_matrix = np.asarray(chunk_embeddings, dtype="float32")

# Build a flat L2 FAISS index over the chunk vectors and write it to disk
index = faiss.IndexFlatL2(model.vector_size)
index.add(embedding_matrix)
faiss.write_index(index, "reddit_faiss.index")

# Reload the persisted model and index for the search API
model = Word2Vec.load("reddit_word2vec.model")
index = faiss.read_index("reddit_faiss.index")

# Position in the label list -> subreddit label
subreddit_map = dict(enumerate(subreddit_labels))
# Distinct labels, sorted
unique_subreddits = sorted({*subreddit_labels})
# Each chunk's tokens re-joined with single spaces
original_chunks = list(map(" ".join, tokenized_chunks))

# Search function
def embed_text(text):
    """Embed a free-text query as the mean Word2Vec vector of its known tokens.

    The query is normalised with ``clean_text``, the same routine applied to
    the chunks before training, so punctuation, digits and URLs are removed
    before the vocabulary lookup. Merely lowercasing and splitting the raw
    query would leave tokens such as ``"python?"`` that can never occur in a
    vocabulary built from letters-only text.

    Args:
        text: Query string; ``None`` is treated as an empty query.

    Returns:
        A vector of length ``model.vector_size``: the mean of the vectors of
        in-vocabulary tokens, or all zeros when no token is known.
    """
    tokens = clean_text(text or "").split()
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

def search_reddit(query, selected_subreddit, top_k=5):
    """Return up to *top_k* indexed chunks nearest to *query* in one subreddit.

    The query is embedded with ``embed_text`` and ``top_k * 2`` nearest
    neighbours are requested from the FAISS index. The hits are then
    filtered down to those whose label equals *selected_subreddit*;
    requesting twice as many leaves headroom for hits the filter discards.

    Args:
        query: Free-text search string.
        selected_subreddit: Label compared against ``subreddit_map`` values.
        top_k: Maximum number of chunks to return.

    Returns:
        The matching chunks, each prefixed with a marker and separated by
        blank lines, or a "no results" message when nothing matches.
    """
    query_vec = embed_text(query).astype("float32").reshape(1, -1)
    _, neighbour_ids = index.search(query_vec, top_k * 2)
    results = []
    for idx in neighbour_ids[0]:
        idx = int(idx)
        # FAISS fills missing neighbours with -1 when the index holds fewer
        # vectors than requested. -1 passes an upper-bound-only check and
        # then fails the label lookup, so reject out-of-range ids explicitly.
        if not 0 <= idx < len(chunked_comments):
            continue
        # A chunk without a label cannot match any subreddit.
        if idx not in subreddit_map or subreddit_map[idx] != selected_subreddit:
            continue
        results.append(f"🔸 {chunked_comments[idx]}")
        if len(results) >= top_k:
            break
    return "\n\n".join(results) if results else "⚠️ No relevant results found."

# Gradio UI
# The page is declared top to bottom inside the Blocks context: a header
# (logo + title), a row with the two inputs, the results box and the button.
with gr.Blocks(theme=gr.themes.Base(primary_hue="orange")) as demo:
    # Header: logo image without a label, followed by title and disclaimer
    gr.Image(value="https://1000logos.net/wp-content/uploads/2017/05/Reddit-Logo.png", show_label=False, height=100)
    gr.Markdown("## Reddit Semantic Search (Powered by Word2Vec + FAISS)\n_Disclaimer: Prototype, not affiliated with Reddit Inc._")

    # Inputs: the free-text query and the subreddit filter, whose choices
    # are the distinct labels collected in unique_subreddits
    with gr.Row():
        query = gr.Textbox(label="Enter Reddit-style query")
        subreddit_dropdown = gr.Dropdown(choices=unique_subreddits, label="Choose Subreddit")

    # Results box and the button that triggers a search
    output = gr.Textbox(label="Matching Comments", lines=10)
    search_btn = gr.Button("🔍 Search")

    # Only the query and the dropdown are wired as inputs, so search_reddit
    # always runs with its default top_k; its return value fills `output`
    search_btn.click(fn=search_reddit, inputs=[query, subreddit_dropdown], outputs=output)

# Start the app; share=True requests a public share link from Gradio
demo.launch(share=True)