DTanzillo commited on
Commit
2dd29df
·
verified ·
1 Parent(s): 04f745c

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +15 -14
  2. app.py +53 -0
  3. faiss_index.bin +3 -0
  4. faiss_meta.pkl +3 -0
  5. requirements.txt +5 -0
README.md CHANGED
@@ -1,14 +1,15 @@
1
- ---
2
- title: Substack Search
3
- emoji: 🏆
4
- colorFrom: indigo
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 6.0.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Dominic's Substack
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
+ # Semantic Search over Substack Posts
2
+
3
+ This project builds a semantic search engine over a collection of HTML posts.
4
+
5
+ ## Steps
6
+
7
+ 1. Place all `.html` files into a folder named `posts/`
8
+ 2. Run:
9
+
10
+ ```
11
+ pip install -r requirements.txt
12
+ python src/build_index.py
13
+ python app.py
14
+ ```
15
+ 3. The app will load the FAISS database and start a Gradio interface.
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import pickle

import faiss
import gradio as gr
from sentence_transformers import SentenceTransformer

# NOTE: App was originally built on Chroma, but Apple Silicon errors proved
# too tough to resolve. ChatGPT 5.1 was consulted at 12:50pm on 11/25/25 for
# assistance in understanding FAISS as a suitable alternative.

# Resolve data files relative to this script so the app works from any CWD.
ROOT = os.path.dirname(os.path.abspath(__file__))

INDEX_PATH = os.path.join(ROOT, "faiss_index.bin")
META_PATH = os.path.join(ROOT, "faiss_meta.pkl")

# Load the prebuilt FAISS index (produced by src/build_index.py).
index = faiss.read_index(INDEX_PATH)

# Load chunk metadata: three parallel structures pickled together.
# Use a context manager so the file handle is closed deterministically
# (the original `pickle.load(open(...))` left closing to the GC).
with open(META_PATH, "rb") as f:
    texts, ids, meta = pickle.load(f)

# Embedding model — must match the model used at index-build time so that
# query vectors live in the same space as the indexed vectors.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
23
def semantic_search(query, k=3):
    """Return the top-k semantically similar chunks for *query* as Markdown.

    Parameters
    ----------
    query : str
        Free-text search query. Blank/whitespace-only input returns a
        prompt message instead of searching.
    k : int | float, default 3
        Number of results to return. Coerced to an int >= 1 because the
        Gradio slider may deliver a float, and FAISS requires an int k.

    Returns
    -------
    str
        Markdown with one "Result" section per hit (source, chunk id, text).
    """
    if not query.strip():
        return "Enter a search query."

    # FAISS's search() requires an integer k; the UI slider can pass a float.
    k = max(1, int(k))

    q_emb = model.encode([query]).astype("float32")
    D, I = index.search(q_emb, k)

    out = "# Search Results\n\n"
    for rank, idx in enumerate(I[0], start=1):
        # FAISS pads with -1 when the index holds fewer than k vectors;
        # skip those sentinels rather than crash on meta[-1].
        if idx < 0:
            continue
        src = meta[idx]["source"]
        chunk = meta[idx]["chunk"]
        text = texts[idx]
        out += f"### Result {rank}\n"
        out += f"**Source:** {src} | **Chunk:** {chunk}\n\n"
        out += f"{text}\n\n---\n\n"

    return out
41
# Wire the search function into a simple two-input Gradio UI.
query_box = gr.Textbox(label="Query", lines=2)
k_slider = gr.Slider(1, 10, value=3, step=1, label="Results")
results_view = gr.Markdown(label="Results")

demo = gr.Interface(
    fn=semantic_search,
    inputs=[query_box, k_slider],
    outputs=results_view,
    title="FAISS Semantic Search Engine",
    description="Search Substack posts using semantic similarity.",
)

# Launch only when run as a script (Spaces imports this module directly).
if __name__ == "__main__":
    demo.launch()
faiss_index.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12e0b4a59a87396a26926072f76005bdf961ba743d03dc2a3e92f5bef25feec1
3
+ size 474669
faiss_meta.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a48fdd6c4bf1c3fe343962a123aabc1eeb8d1dc2dbd95c84bce69b017b31efde
3
+ size 135075
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ faiss-cpu
3
+ sentence-transformers
4
+ torch
5
+ beautifulsoup4