# -*- coding: utf-8 -*-
"""app.py
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1nLqIbyBDiBI96gDZ0TziLNX8I4uWnl9G
"""
!pip install datasets
"""Picking subreddits, split=sub as the data on huggingface datasets is split w.r.t subreddits and not train/test/validation.
Streaming = True, because we don't want to load all the data into local memory
loading and combining all the iterables together.
"""
from datasets import load_dataset, concatenate_datasets
target_subreddits = ["askscience", "gaming", "technology", "todayilearned", "programming"]
# Load and stream each subreddit split individually
datasets = [
    load_dataset("HuggingFaceGECLM/REDDIT_comments", split=sub, streaming=True)
    for sub in target_subreddits
]
# Combine into one iterable dataset
from itertools import chain
combined_dataset = chain(*datasets)
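# Optional sanity check (a minimal sketch): pull a single record from one streamed split to
# confirm the fields used below ("body", "subreddit_name_prefixed") are present. This creates
# a fresh iterator, so it does not consume records from combined_dataset.
sample_record = next(iter(datasets[0]))
print({k: sample_record[k] for k in ("body", "subreddit_name_prefixed") if k in sample_record})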
"""# Chunking Logic
- Group Reddit comments into small textual chunks to create a unit of meaning for embedding.
- Short Reddit comments are noisy and lack semantic depth. Chunking lets us:
- Aggregate context across comments
- Improve embedding quality for semantic search
- Normalize input length for vector similarity
- We'll group n comments (3-5) per chunk or limit chunk size by token count (100 words).
**Use PySpark for handling the large concatenantion of chunked data**
"""
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, monotonically_increasing_id
from pyspark.sql.types import StringType
import re
from itertools import islice
spark = SparkSession.builder.getOrCreate()
# Materialize a 100k-comment sample from the stream and load it into a Spark DataFrame
df = spark.createDataFrame([{"body": ex["body"]} for ex in islice(combined_dataset, 100000)])
# Clean text UDF
def clean_body(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()
clean_udf = udf(clean_body, StringType())
df_clean = df.withColumn("clean", clean_udf(col("body")))
# Add row numbers to chunk (note: monotonically_increasing_id is increasing but not
# consecutive across partitions, so chunks are approximately chunk_size comments each)
df_indexed = df_clean.withColumn("row_num", monotonically_increasing_id())
chunk_size = 5
df_indexed = df_indexed.withColumn("chunk_id", (col("row_num") / chunk_size).cast("int"))
# Group and concatenate
from pyspark.sql.functions import collect_list, concat_ws
df_chunked = df_indexed.groupBy("chunk_id").agg(concat_ws(" ", collect_list("clean")).alias("chunk_text"))
chunked_comments = df_chunked.select("chunk_text").rdd.map(lambda x: x[0]).collect()
# Collect one subreddit label per chunk by continuing to read from the stream
# (labels are taken per-comment, so this is only an approximate chunk-level label)
subreddit_labels = []
for example in combined_dataset:
    subreddit_labels.append(example["subreddit_name_prefixed"])
    if len(subreddit_labels) >= len(chunked_comments):
        break
"""Cleaner text = better embeddings. Noise like markdown or links pollute meaning.
We'll use regex and basic string methods.
Normalize the text: remove URLs, HTML tags, Reddit-specific formatting, etc.
"""
!pip install gensim tqdm
from gensim.models import Word2Vec
from tqdm import tqdm
import re
def clean_text(text):
    # Lowercase, remove URLs, special chars
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
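# A hedged sketch of the extended cleanup mentioned above (HTML tags and Reddit-specific
# formatting); not wired into the pipeline, and the helper name and patterns are illustrative.
def clean_text_extended(text):
    text = text.lower()
    text = re.sub(r"<[^>]+>", " ", text)                  # HTML tags
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)  # URLs
    text = re.sub(r"/?u/\w+|/?r/\w+", " ", text)          # user / subreddit mentions
    text = re.sub(r"[*_~>`#]+", " ", text)                # markdown markers
    text = re.sub(r"[^a-zA-Z\s]", " ", text)              # remaining special characters
    return re.sub(r"\s+", " ", text).strip()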
tokenized_chunks = []
for chunk in tqdm(chunked_comments):
    cleaned = clean_text(chunk)
    tokens = cleaned.split()  # Simple whitespace tokenizer
    tokenized_chunks.append(tokens)
"""Chunking + Tokenizing, removing urls, reddit slang words and unnecessary noisy text information.
vector_size=100, # Size of word embeddings (dimensionality)
window=5, # Context window size (how many words to look left/right)
min_count=2, # Ignores words with frequency < 2 (reduces noise)
workers=4, # Parallel training threads (CPU cores)
sg=1 # 1 = Skip-Gram (better for rare words); 0 =CBOW
"""
model = Word2Vec(sentences=tokenized_chunks, vector_size=100, window=5, min_count=2, workers=4, sg=1)
model.save("reddit_word2vec.model")
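# A quick, hedged sanity check of the trained embeddings: nearest neighbours for a probe word
# (the probe word "science" is an assumption; it only works if the word survived min_count=2).
if "science" in model.wv:
    print(model.wv.most_similar("science", topn=5))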
"""Training a custom Word2Vec model for embeddings.
Word2Vec learns dense vector representations (embeddings) for words by capturing their semantic context in a corpus. It enables semantic similarity, clustering, and search.
Skip-gram learns to predict surrounding words for a given center word. It performs better on small to medium-sized datasets and captures rare word semantics effectively.
- Word2Vec only generates vectors for individual words, not entire sentences or documents.
- Each word gets mapped to a dense vector (e.g., 100-dim) that captures its semantic relationships with other words.
# Why Averaging?
- It's a simple and surprisingly strong baseline:
  - Works well in low-resource or custom-trained embedding settings
  - Keeps computation cheap
  - Captures the "semantic center" of the chunk
Alternative strategies (a TF-IDF-weighted sketch follows the averaging code below):
- Weighted average (e.g., using TF-IDF or word frequency)
- Doc2Vec (learns doc embeddings directly)
- Transformers (e.g., BERT) for sentence embeddings (but heavier)
"""
import numpy as np
def get_chunk_embedding(chunk_tokens, model):
    # Average the Word2Vec vectors of all in-vocabulary tokens in the chunk
    vectors = []
    for token in chunk_tokens:
        if token in model.wv:
            vectors.append(model.wv[token])
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
# Dense embedding for each chunk
chunk_embeddings = [get_chunk_embedding(tokens, model) for tokens in tokenized_chunks]
"""Converting variable length chunks to fixed level embeddings"""
!pip install faiss-cpu
import faiss
# Convert embeddings to float32 numpy array
embedding_matrix = np.array(chunk_embeddings).astype("float32")
# Initialize FAISS index (exact L2 distance)
index = faiss.IndexFlatL2(model.vector_size)
index.add(embedding_matrix)
"""Building FAISS index with the dense vectors generated from avaraging earlier.
FAISS is optimized for fast, approximate nearest-neighbor search — standard for semantic search pipelines.
Indexing takes precomputed embeddings (vectors generated from text) and organizes them into a searchable format like FAISS, enabling fast similarity-based retrieval.
"""
import faiss
import numpy as np
# Embed each chunk using average Word2Vec token embeddings
def embed_chunk(text, model):
    tokens = text.lower().split()
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
embeddings = np.array([embed_chunk(chunk, model) for chunk in chunked_comments]).astype("float32")
# Build and save FAISS index
index = faiss.IndexFlatL2(model.vector_size)
index.add(embeddings)
faiss.write_index(index, "reddit_faiss.index")
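# A hedged sketch of an approximate FAISS index as an alternative to the exact IndexFlatL2
# above (not used by the app below; nlist and nprobe values are illustrative assumptions).
# IVF clusters the vectors and searches only a few clusters per query, trading a little
# recall for speed at larger scale.
nlist = 64                                        # number of coarse clusters
quantizer = faiss.IndexFlatL2(model.vector_size)  # coarse quantizer
ivf_index = faiss.IndexIVFFlat(quantizer, model.vector_size, nlist)
ivf_index.train(embeddings)                       # k-means training on the embedding matrix
ivf_index.add(embeddings)
ivf_index.nprobe = 8                              # clusters probed at query time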
def search(query, model, index, top_k=5):
    tokens = clean_text(query).split()
    query_vec = get_chunk_embedding(tokens, model).astype("float32").reshape(1, -1)
    distances, indices = index.search(query_vec, top_k)
    return indices[0], distances[0]
original_chunks = [" ".join(tokens) for tokens in tokenized_chunks]
query = "quantum physics experiments"
top_ids, top_distances = search(query, model, index)
for i, idx in enumerate(top_ids):
    print(f"Rank {i+1} | Distance: {top_distances[i]:.2f}")
    print(original_chunks[idx][:300], "...\n")
"""# **Reddit Semantic Search App**"""
import gradio as gr
import numpy as np
from gensim.models import Word2Vec
import faiss
# Load Word2Vec model and FAISS index
model = Word2Vec.load("reddit_word2vec.model")
index = faiss.read_index("reddit_faiss.index")
# Prepare embedding function
def embed_text(text):
    tokens = text.lower().split()
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
# Build subreddit index
subreddit_map = {i: label for i, label in enumerate(subreddit_labels)}
unique_subreddits = sorted(set(subreddit_labels)) # for dropdown
# Semantic search function
def search_reddit(query, selected_subreddit, top_k=5):
    query_vec = embed_text(query).astype("float32")
    # Over-fetch candidates so that filtering by subreddit still leaves up to top_k results
    D, I = index.search(np.array([query_vec]), top_k * 10)
    results = []
    for idx in I[0]:
        if idx < len(chunked_comments) and subreddit_map.get(idx) == selected_subreddit:
            results.append(f"🔸 {chunked_comments[idx]}")
        if len(results) >= top_k:
            break
    if not results:
        return "⚠️ No relevant results found."
    return "\n\n".join(results)
# Gradio UI
with gr.Blocks(theme=gr.themes.Base(primary_hue="orange", secondary_hue="gray")) as demo:
    gr.Image(
        value="https://1000logos.net/wp-content/uploads/2017/05/Reddit-Logo.png",
        show_label=False,
        height=100
    )
    gr.Markdown("## Reddit Semantic Search (Powered by Word2Vec + FAISS)\n_Disclaimer: Experimental prototype, not owned/developed by Reddit Inc_")
    with gr.Row():
        query = gr.Textbox(label="Enter your Reddit-like query", placeholder="e.g. What's new in AI?")
        subreddit_dropdown = gr.Dropdown(choices=unique_subreddits, label="Subreddit")
    output = gr.Textbox(label="Top Matching Chunks", lines=10)
    search_btn = gr.Button("🔍 Search")
    search_btn.click(fn=search_reddit, inputs=[query, subreddit_dropdown], outputs=output)
demo.launch(share=True)