Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- README.md +15 -14
- app.py +53 -0
- faiss_index.bin +3 -0
- faiss_meta.pkl +3 -0
- requirements.txt +5 -0
README.md
CHANGED
|
@@ -1,14 +1,15 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
| 1 |
+
# Semantic Search over Substack Posts
|
| 2 |
+
|
| 3 |
+
This project builds a semantic search engine over a collection of HTML posts.
|
| 4 |
+
|
| 5 |
+
## Steps
|
| 6 |
+
|
| 7 |
+
1. Place all .html files into a folder named posts/
|
| 8 |
+
2. Run:
|
| 9 |
+
|
| 10 |
+
```
|
| 11 |
+
pip install -r requirements.txt
|
| 12 |
+
python src/build_index.py
|
| 13 |
+
python app.py
|
| 14 |
+
```
|
| 15 |
+
3. The app will load the FAISS database and start a Gradio interface.
|
app.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import pickle

import faiss
import gradio as gr
from sentence_transformers import SentenceTransformer

# App originally created for Chroma; however, Apple Silicon errors proved too
# tough to resolve. ChatGPT 5.1 was consulted at 12:50pm on 11/25/25 for
# assistance in understanding FAISS as a suitable alternative.

# All artifact paths are resolved relative to this file so the app works
# regardless of the current working directory (e.g. on HF Spaces).
ROOT = os.path.dirname(os.path.abspath(__file__))

INDEX_PATH = os.path.join(ROOT, "faiss_index.bin")
META_PATH = os.path.join(ROOT, "faiss_meta.pkl")

# Load FAISS index (built by src/build_index.py).
index = faiss.read_index(INDEX_PATH)

# Load metadata. Use a context manager so the file handle is closed
# deterministically (the original open() call leaked it).
# Pickle layout: (texts, ids, meta) — presumably written by src/build_index.py;
# NOTE(review): pickle is only safe here because the file ships with the app.
with open(META_PATH, "rb") as f:
    texts, ids, meta = pickle.load(f)

# Load the sentence-embedding model used both at index-build and query time.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+
def semantic_search(query, k=3):
|
| 24 |
+
if not query.strip():
|
| 25 |
+
return "Enter a search query."
|
| 26 |
+
|
| 27 |
+
q_emb = model.encode([query]).astype("float32")
|
| 28 |
+
D, I = index.search(q_emb, k)
|
| 29 |
+
|
| 30 |
+
out = "# Search Results\n\n"
|
| 31 |
+
for rank, idx in enumerate(I[0], start=1):
|
| 32 |
+
src = meta[idx]["source"]
|
| 33 |
+
chunk = meta[idx]["chunk"]
|
| 34 |
+
text = texts[idx]
|
| 35 |
+
out += f"### Result {rank}\n"
|
| 36 |
+
out += f"**Source:** {src} | **Chunk:** {chunk}\n\n"
|
| 37 |
+
out += f"{text}\n\n---\n\n"
|
| 38 |
+
|
| 39 |
+
return out
|
| 40 |
+
|
| 41 |
+
demo = gr.Interface(
|
| 42 |
+
fn=semantic_search,
|
| 43 |
+
inputs=[
|
| 44 |
+
gr.Textbox(label="Query", lines=2),
|
| 45 |
+
gr.Slider(1, 10, value=3, step=1, label="Results")
|
| 46 |
+
],
|
| 47 |
+
outputs=gr.Markdown(label="Results"),
|
| 48 |
+
title="FAISS Semantic Search Engine",
|
| 49 |
+
description="Search Substack posts using semantic similarity."
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
if __name__ == "__main__":
|
| 53 |
+
demo.launch()
|
faiss_index.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12e0b4a59a87396a26926072f76005bdf961ba743d03dc2a3e92f5bef25feec1
|
| 3 |
+
size 474669
|
faiss_meta.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a48fdd6c4bf1c3fe343962a123aabc1eeb8d1dc2dbd95c84bce69b017b31efde
|
| 3 |
+
size 135075
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
faiss-cpu
|
| 3 |
+
sentence-transformers
|
| 4 |
+
torch
|
| 5 |
+
beautifulsoup4
|