Spaces:

uc-msai631-2025-spring
/

Group_5_Project

Sleeping

App Files Files Community

StevenMSAI commited on Oct 11

Commit

405b8de

verified ·

1 Parent(s): 97ac949

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -2

app.py CHANGED Viewed

@@ -1,3 +1,125 @@
 import gradio as gr
-def greet(name): return f"Hello {name}!!"
-gr.Interface(fn=greet, inputs="text", outputs="text", title="Group 5 Project").launch()

+import os
+import io
 import gradio as gr
+import faiss
+import numpy as np
+from pypdf import PdfReader
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+# ---- Models (CPU-friendly) ----
+# We're using Hugging Face's free tier, which is 2 virtual
+# cores and 16gb ram only. So we need to keep these lightweight + cpu-only
+EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # small & fast on CPU
+GEN_MODEL_NAME = "google/flan-t5-small"                      # text2text model that runs on CPU
+embedder = SentenceTransformer(EMBED_MODEL_NAME)
+generator = pipeline("text2text-generation", model=GEN_MODEL_NAME)
+# ---- PDF to text ----
+def pdfs_to_texts(files):
+    texts = []
+    for f in files:
+        # f is an object from Gradio that read bytes for pypdf
+        reader = PdfReader(io.BytesIO(f.read()))
+        pages = [page.extract_text() or "" for page in reader.pages]
+        texts.append("\n".join(pages))
+    return texts
+# ---- Chunking ----
+def chunk_text(text, chunk_size=600, overlap=120):
+    words = text.split()
+    chunks = []
+    i = 0
+    while i < len(words):
+        chunk = words[i:i+chunk_size]
+        chunks.append(" ".join(chunk))
+        i += chunk_size - overlap
+    return chunks
+# ---- Build FAISS index from uploaded PDFs ----
+index = None
+corpus_chunks = []
+def build_index(files, progress=gr.Progress()):
+    global index, corpus_chunks
+    texts = pdfs_to_texts(files)
+    # basic cleanup + chunk
+    corpus_chunks = []
+    for t in texts:
+        if not t.strip():
+            continue
+        corpus_chunks += chunk_text(t)
+    if not corpus_chunks:
+        return "No text extracted from PDFs.", None
+    progress(0.3, desc="Embedding chunks…")
+    embeddings = embedder.encode(corpus_chunks, convert_to_numpy=True, show_progress_bar=False)
+    d = embeddings.shape[1]
+    progress(0.6, desc="Creating FAISS index…")
+    index = faiss.IndexFlatIP(d)  # cosine via inner product on normalized vectors
+    # normalize to unit length to approximate cosine similarity
+    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-10
+    embeddings = embeddings / norms
+    index.add(embeddings.astype(np.float32))
+    return f"Indexed {len(corpus_chunks)} chunks.", len(corpus_chunks)
+# ---- RAG query -> retrieve -> generate ----
+def answer_question(question, top_k=5, max_new_tokens=256):
+    if index is None or not corpus_chunks:
+        return "Index not built yet. Upload PDFs and click **Build Index** first."
+    # embed query (normalize for inner product)
+    q = embedder.encode([question], convert_to_numpy=True)
+    q = q / (np.linalg.norm(q, axis=1, keepdims=True) + 1e-10)
+    D, I = index.search(q.astype(np.float32), int(top_k))
+    retrieved = [corpus_chunks[i] for i in I[0] if i < len(corpus_chunks)]
+    context = "\n\n".join(retrieved)
+    prompt = (
+        "You are a helpful study assistant. Using ONLY the context, answer the question.\n"
+        "If the answer isn't in the context, say you don't have enough information.\n\n"
+        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
+    )
+    out = generator(prompt, max_new_tokens=int(max_new_tokens), temperature=0.2)
+    return out[0]["generated_text"].strip()
+# ---- Gradio v5 UI (Blocks) ----
+with gr.Blocks(title="Group 5 Study Helper (RAG)") as demo:
+    gr.Markdown("# Group 5 Study Helper (RAG)\nUpload PDFs → Build Index → Ask questions.")
+    with gr.Row():
+        file_in = gr.Files(file_types=[".pdf"], label="Upload PDF files")
+    with gr.Row():
+        build_btn = gr.Button("Build Index", variant="primary")
+        status = gr.Markdown()
+        chunk_count = gr.Number(label="Chunk count", interactive=False)
+    with gr.Row():
+        question = gr.Textbox(label="Your question")
+    with gr.Row():
+        topk = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
+        max_tokens = gr.Slider(64, 512, value=256, step=16, label="Max new tokens")
+    with gr.Row():
+        ask_btn = gr.Button("Ask", variant="primary")
+    with gr.Row():
+        answer = gr.Markdown(label="Answer")
+    def _build(files):
+        msg, n = build_index(files)
+        return msg, n or 0
+    build_btn.click(_build, inputs=[file_in], outputs=[status, chunk_count])
+    ask_btn.click(answer_question, inputs=[question, topk, max_tokens], outputs=[answer])
+demo.launch()