ishmeet-yo commited on
Commit
b34d6f6
·
verified ·
1 Parent(s): a21be97

Upload 7 files

Browse files
Files changed (7) hide show
  1. Dockerfile +10 -0
  2. README.md +7 -11
  3. app/data/harry_potter_1.txt +0 -0
  4. app/main.py +32 -0
  5. app/rag.py +158 -0
  6. requirements.txt +6 -0
  7. templates/index.html +49 -0
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10

WORKDIR /app

# Copy and install requirements before copying the source tree so the
# dependency layer is cached across code-only rebuilds.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# 7860 is the port Hugging Face Spaces expects the app to listen on.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,11 +1,7 @@
1
- ---
2
- title: ISH Harry Potter Rag
3
- emoji: 👀
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Harry Potter RAG
2
+
3
+ Semantic Retrieval-Augmented Generation system using FastAPI and Sentence Transformers.
4
+
5
## Run locally

```bash
uvicorn app.main:app --reload
```
 
 
 
app/data/harry_potter_1.txt ADDED
The diff for this file is too large to render. See raw diff
 
app/main.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from fastapi import FastAPI, Request
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates

# The Dockerfile starts the server as "uvicorn app.main:app" from /app,
# which puts /app (not /app/app) on sys.path — so the package-qualified
# import is required there. Keep the bare import as a fallback for
# running "uvicorn main:app" from inside app/.
try:
    from app.rag import load_data, retrieve_chunks
except ImportError:
    from rag import load_data, retrieve_chunks

app = FastAPI()

templates = Jinja2Templates(directory="templates")

# Mount /static only if the directory actually exists: StaticFiles raises
# at startup for a missing directory, and this repo ships no static assets.
if os.path.isdir("static"):
    app.mount("/static", StaticFiles(directory="static"), name="static")

# Build (or load from cache) the corpus chunks and retrieval heads once,
# at import time, so every request reuses them.
chunks, heads = load_data()


@app.get("/")
def home(request: Request):
    """Render the search page."""
    return templates.TemplateResponse(
        "index.html",
        {"request": request}
    )


@app.post("/search")
async def search(request: Request):
    """Retrieve chunks for the posted query.

    Expects a JSON body {"query": "..."}; returns the two best chunks
    joined as "answer" plus the full retrieved list as "sources".
    """
    body = await request.json()
    query = body["query"]

    retrieved = retrieve_chunks(query, chunks, heads)
    answer = "\n\n".join(retrieved[:2])

    return {
        "answer": answer,
        "sources": retrieved
    }
app/rag.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import hashlib
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.preprocessing import normalize
8
+
9
+ CACHE_DIR = "app/cache"
10
+ DATA_DIR = "app/data"
11
+ MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
12
+
13
+
14
+ def compute_hash(files):
15
+ h = hashlib.md5()
16
+ for f in files:
17
+ with open(f, "rb") as fp:
18
+ h.update(fp.read())
19
+ return h.hexdigest()
20
+
21
+
22
def load_documents():
    """Read every .txt file under DATA_DIR.

    Returns (texts, files). The file list is sorted so chunk order and
    the dataset hash are deterministic — os.listdir order is arbitrary,
    and the original unsorted listing could invalidate the cache (or
    reorder chunks) between runs or machines with identical data.
    """
    files = sorted(
        os.path.join(DATA_DIR, name)
        for name in os.listdir(DATA_DIR)
        if name.endswith(".txt")
    )

    texts = []
    for path in files:
        # errors="ignore": book dumps may contain stray non-UTF-8 bytes.
        with open(path, encoding="utf-8", errors="ignore") as fp:
            texts.append(fp.read())

    return texts, files
35
+
36
+
37
def chunk_text(text, size=500, overlap=100):
    """Split *text* into word-based chunks.

    Each chunk holds up to *size* words and shares *overlap* words with
    the previous chunk. Returns [] for empty/whitespace-only text.

    Raises:
        ValueError: if overlap >= size — the original while-loop advanced
        by (size - overlap) per iteration, so a non-positive step would
        loop forever.
    """
    step = size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than size")

    words = text.split()
    return [
        " ".join(words[start:start + size])
        for start in range(0, len(words), step)
    ]
48
+
49
def chunk_documents(texts):
    """Chunk every document and flatten the results into one list."""
    return [piece for document in texts for piece in chunk_text(document)]
54
+
55
+
56
def build_embeddings(chunks):
    """Encode *chunks* into the retrieval heads used by retrieve_chunks.

    Returns a dict with three L2-normalized dense matrices ("semantic",
    "narrative", "entity"), the fitted TfidfVectorizer plus its matrix,
    and the SentenceTransformer model for query-time encoding.
    """
    model = SentenceTransformer(MODEL_NAME)

    semantic = normalize(model.encode(chunks))
    narrative = normalize(model.encode(
        ["Story context: " + c for c in chunks]
    ))
    # The original encoded the raw chunks a *second* time for the entity
    # head — byte-identical input, identical vectors, double the cost.
    # Reuse the semantic matrix; values (and cache contents) are unchanged.
    entity = semantic

    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(chunks)

    return {
        "semantic": semantic,
        "narrative": narrative,
        "entity": entity,
        "tfidf": tfidf,
        "tfidf_matrix": tfidf_matrix,
        "model": model
    }
76
+
77
+
78
def save_cache(chunks, heads, dataset_hash):
    """Persist chunks, dense heads, tf-idf artifacts and the corpus hash.

    Layout under CACHE_DIR: one .npy per dense head, pickles for the
    chunk list and tf-idf objects, and hash.txt for cache invalidation.
    The model itself is not cached (load_cache reloads it by name).
    """
    os.makedirs(CACHE_DIR, exist_ok=True)

    for head_name in ("semantic", "narrative", "entity"):
        np.save(f"{CACHE_DIR}/{head_name}.npy", heads[head_name])

    pickled = (
        ("chunks.pkl", chunks),
        ("tfidf.pkl", heads["tfidf"]),
        ("tfidf_matrix.pkl", heads["tfidf_matrix"]),
    )
    for filename, payload in pickled:
        with open(f"{CACHE_DIR}/{filename}", "wb") as fh:
            pickle.dump(payload, fh)

    with open(f"{CACHE_DIR}/hash.txt", "w") as fh:
        fh.write(dataset_hash)
96
+
97
def load_cache():
    """Restore (chunks, heads) previously written by save_cache."""
    with open(f"{CACHE_DIR}/chunks.pkl", "rb") as fh:
        chunks = pickle.load(fh)

    heads = {
        name: np.load(f"{CACHE_DIR}/{name}.npy")
        for name in ("semantic", "narrative", "entity")
    }

    for key in ("tfidf", "tfidf_matrix"):
        with open(f"{CACHE_DIR}/{key}.pkl", "rb") as fh:
            heads[key] = pickle.load(fh)

    # The encoder is not part of the cache; reload it for query encoding.
    heads["model"] = SentenceTransformer(MODEL_NAME)
    return chunks, heads
115
+
116
def load_data():
    """Return (chunks, heads), rebuilding embeddings only when needed.

    Compares the MD5 of the current data files against the hash stored
    alongside the cache; on a match the cached artifacts are loaded,
    otherwise everything is re-encoded and the cache is refreshed.
    """
    texts, files = load_documents()
    chunks = chunk_documents(texts)
    dataset_hash = compute_hash(files)

    hash_path = f"{CACHE_DIR}/hash.txt"

    cached_hash = None
    if os.path.exists(hash_path):
        with open(hash_path) as fh:
            cached_hash = fh.read().strip()

    if cached_hash == dataset_hash:
        print("Loading embeddings from cache")
        return load_cache()

    print("Building embeddings")
    heads = build_embeddings(chunks)
    save_cache(chunks, heads, dataset_hash)
    return chunks, heads
137
+
138
+
139
def retrieve_chunks(query, chunks, heads, k=5):
    """Return the top-*k* chunks for *query*.

    Score = 0.45 * semantic cosine + 0.35 * narrative cosine
          + 0.20 * tf-idf keyword overlap.
    NOTE(review): the "entity" head built by build_embeddings is never
    scored here — confirm whether that is intentional.
    """
    encoder = heads["model"]

    query_sem = normalize(encoder.encode([query]))
    query_nav = normalize(encoder.encode(["Story question: " + query]))

    semantic_scores = heads["semantic"] @ query_sem.T
    narrative_scores = heads["narrative"] @ query_nav.T

    query_tfidf = heads["tfidf"].transform([query])
    keyword_scores = heads["tfidf_matrix"] @ query_tfidf.T

    combined = (
        0.45 * semantic_scores +
        0.35 * narrative_scores +
        0.20 * keyword_scores.toarray()
    )

    ranked = np.argsort(combined.flatten())[::-1][:k]
    return [chunks[i] for i in ranked]
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ sentence-transformers
4
+ numpy
5
+ scikit-learn
6
+ jinja2
templates/index.html ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html>
<head>
    <title>Harry Potter RAG</title>
    <script src="https://cdn.tailwindcss.com"></script>
</head>

<body class="bg-zinc-50">
    <div class="max-w-3xl mx-auto py-20">
        <h1 class="text-4xl font-semibold mb-6">
            Harry Potter Semantic Search
        </h1>

        <input
            id="query"
            class="w-full p-4 rounded-xl shadow"
            placeholder="Ask something..."
        />

        <button
            onclick="search()"
            class="mt-4 px-6 py-3 bg-black text-white rounded-xl"
        >
            Search
        </button>

        <div id="answer" class="mt-8"></div>
    </div>

    <script>
        async function search() {
            const q = document.getElementById("query").value;

            const res = await fetch("/search", {
                method: "POST",
                headers: {"Content-Type": "application/json"},
                body: JSON.stringify({query: q})
            });

            const data = await res.json();

            // Insert the answer as text, not HTML: the original used
            // innerHTML with the raw passage, so any "<" or "&" in the
            // book text would be parsed as markup and break the output.
            const card = document.createElement("div");
            card.className = "p-6 bg-white rounded-xl shadow";
            card.textContent = data.answer;
            document.getElementById("answer").replaceChildren(card);
        }
    </script>
</body>
</html>