DTanzillo commited on
Commit
2dd29df
·
verified ·
1 Parent(s): 04f745c

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +15 -14
  2. app.py +53 -0
  3. faiss_index.bin +3 -0
  4. faiss_meta.pkl +3 -0
  5. requirements.txt +5 -0
README.md CHANGED
@@ -1,14 +1,15 @@
1
- ---
2
- title: Substack Search
3
- emoji: 🏆
4
- colorFrom: indigo
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 6.0.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Dominic's Substack
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
+ # Semantic Search over Substack Posts
2
+
3
+ This project builds a semantic search engine over a collection of HTML posts.
4
+
5
+ ## Steps
6
+
7
+ 1. Place all `.html` files into a folder named `posts/`
8
+ 2. Run:
9
+
10
+ ```
11
+ pip install -r requirements.txt
12
+ python src/build_index.py
13
+ python app.py
14
+ ```
15
+ 3. The app will load the FAISS database and start a Gradio interface.
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import pickle

import faiss
import gradio as gr
from sentence_transformers import SentenceTransformer

# NOTE: App was originally built on Chroma, but Apple Silicon errors proved
# too tough to resolve. ChatGPT 5.1 was consulted at 12:50pm on 11/25/25 for
# assistance in understanding FAISS as a suitable alternative.

# Resolve data files relative to this script so the app works from any CWD.
ROOT = os.path.dirname(os.path.abspath(__file__))

INDEX_PATH = os.path.join(ROOT, "faiss_index.bin")
META_PATH = os.path.join(ROOT, "faiss_meta.pkl")

# Load the prebuilt FAISS index (produced by src/build_index.py).
index = faiss.read_index(INDEX_PATH)

# Load chunk metadata: three parallel structures pickled together.
# Use a context manager so the file handle is closed deterministically
# (the original `pickle.load(open(...))` left closing to the GC).
with open(META_PATH, "rb") as f:
    texts, ids, meta = pickle.load(f)

# Embedding model — must match the model used at index-build time so that
# query vectors live in the same space as the indexed vectors.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
23
def semantic_search(query, k=3):
    """Return the top-k semantically similar chunks for *query* as Markdown.

    Parameters
    ----------
    query : str
        Free-text search query. Blank/whitespace-only input returns a
        prompt message instead of searching.
    k : int | float, default 3
        Number of results to return. Coerced to an int >= 1 because the
        Gradio slider may deliver a float, and FAISS requires an int k.

    Returns
    -------
    str
        Markdown with one "Result" section per hit (source, chunk id, text).
    """
    if not query.strip():
        return "Enter a search query."

    # FAISS's search() requires an integer k; the UI slider can pass a float.
    k = max(1, int(k))

    q_emb = model.encode([query]).astype("float32")
    D, I = index.search(q_emb, k)

    out = "# Search Results\n\n"
    for rank, idx in enumerate(I[0], start=1):
        # FAISS pads with -1 when the index holds fewer than k vectors;
        # skip those sentinels rather than crash on meta[-1].
        if idx < 0:
            continue
        src = meta[idx]["source"]
        chunk = meta[idx]["chunk"]
        text = texts[idx]
        out += f"### Result {rank}\n"
        out += f"**Source:** {src} | **Chunk:** {chunk}\n\n"
        out += f"{text}\n\n---\n\n"

    return out
41
# Wire the search function into a simple two-input Gradio UI.
query_box = gr.Textbox(label="Query", lines=2)
k_slider = gr.Slider(1, 10, value=3, step=1, label="Results")
results_view = gr.Markdown(label="Results")

demo = gr.Interface(
    fn=semantic_search,
    inputs=[query_box, k_slider],
    outputs=results_view,
    title="FAISS Semantic Search Engine",
    description="Search Substack posts using semantic similarity.",
)

# Launch only when run as a script (Spaces imports this module directly).
if __name__ == "__main__":
    demo.launch()
faiss_index.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12e0b4a59a87396a26926072f76005bdf961ba743d03dc2a3e92f5bef25feec1
3
+ size 474669
faiss_meta.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a48fdd6c4bf1c3fe343962a123aabc1eeb8d1dc2dbd95c84bce69b017b31efde
3
+ size 135075
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ faiss-cpu
3
+ sentence-transformers
4
+ torch
5
+ beautifulsoup4