StevenMSAI committed on
Commit
405b8de
·
verified ·
1 Parent(s): 97ac949

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -2
app.py CHANGED
@@ -1,3 +1,125 @@
 
 
1
  import gradio as gr
2
- def greet(name): return f"Hello {name}!!"
3
- gr.Interface(fn=greet, inputs="text", outputs="text", title="Group 5 Project").launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
  import gradio as gr
4
+ import faiss
5
+ import numpy as np
6
+ from pypdf import PdfReader
7
+ from sentence_transformers import SentenceTransformer
8
+ from transformers import pipeline
9
+
10
# ---- Models (CPU-friendly) ----
# The app runs on Hugging Face's free tier (2 virtual cores, 16 GB RAM),
# so both models are deliberately small and CPU-only.

# Sentence-embedding model: small and fast on CPU.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Text-to-text generation model that runs on CPU.
GEN_MODEL_NAME = "google/flan-t5-small"

# Both models are loaded once at import time and shared by every request.
embedder = SentenceTransformer(EMBED_MODEL_NAME)
generator = pipeline(task="text2text-generation", model=GEN_MODEL_NAME)
19
+
20
+ # ---- PDF to text ----
21
def pdfs_to_texts(files):
    """Extract the text of each uploaded PDF.

    Args:
        files: Iterable of uploads from the Gradio file component. Each item
            may be a filesystem path (what Gradio v4/v5 delivers by default),
            an object exposing the path as ``.name``, or a readable binary
            file-like object. ``None`` or an empty list yields ``[]``.

    Returns:
        One string per PDF, with the text of its pages joined by newlines.
        Pages with no extractable text contribute an empty string.
    """
    texts = []
    for f in files or []:
        if hasattr(f, "read"):
            # File-like upload: hand pypdf an in-memory copy of the bytes.
            source = io.BytesIO(f.read())
        else:
            # Path-style upload: a plain path, or a wrapper carrying it in .name.
            source = getattr(f, "name", f)
        reader = PdfReader(source)
        pages = [page.extract_text() or "" for page in reader.pages]
        texts.append("\n".join(pages))
    return texts
29
+
30
+
31
+ # ---- Chunking ----
32
def chunk_text(text, chunk_size=600, overlap=120):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split; words are taken from ``text.split()``.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each a single-space-joined run of words. Empty or
        whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would stop the
            window from ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any later window would only
        # repeat words already covered by this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
41
+
42
+
43
+ # ---- Build FAISS index from uploaded PDFs ----
44
# Module-level retrieval state shared by build_index() and answer_question().
index = None          # FAISS inner-product index, or None until one is built
corpus_chunks = []    # chunk texts; position i corresponds to vector i in `index`


def build_index(files, progress=gr.Progress()):
    """Extract, chunk and embed the uploaded PDFs and publish a new FAISS index.

    The new index and chunk list are built in local variables and assigned to
    the module globals together at the end, so the two never disagree. When
    nothing can be indexed, both globals are reset.

    Args:
        files: PDF uploads from the Gradio file component; may be ``None`` or
            empty when the user has not uploaded anything.
        progress: Gradio progress tracker used to report the embedding and
            indexing stages.

    Returns:
        A ``(status_message, chunk_count)`` tuple. ``chunk_count`` is ``None``
        when nothing was indexed.
    """
    global index, corpus_chunks

    if not files:
        index, corpus_chunks = None, []
        return "No PDFs uploaded.", None

    # Skip documents with no extractable text, chunk the rest.
    new_chunks = []
    for text in pdfs_to_texts(files):
        if text.strip():
            new_chunks.extend(chunk_text(text))

    if not new_chunks:
        index, corpus_chunks = None, []
        return "No text extracted from PDFs.", None

    progress(0.3, desc="Embedding chunks…")
    embeddings = embedder.encode(new_chunks, convert_to_numpy=True, show_progress_bar=False)

    progress(0.6, desc="Creating FAISS index…")
    # Scale every vector to unit length so the inner product computed by
    # IndexFlatIP equals cosine similarity; the epsilon avoids dividing by zero.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-10
    embeddings = (embeddings / norms).astype(np.float32)

    new_index = faiss.IndexFlatIP(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both pieces of state together, only after everything succeeded.
    index, corpus_chunks = new_index, new_chunks
    return f"Indexed {len(new_chunks)} chunks.", len(new_chunks)
73
+
74
+ # ---- RAG query -> retrieve -> generate ----
75
def answer_question(question, top_k=5, max_new_tokens=256):
    """Answer a question from the indexed PDFs (retrieve, then generate).

    Args:
        question: The user's question text.
        top_k: Number of passages to retrieve; capped at the number of
            indexed chunks.
        max_new_tokens: Upper bound on the length of the generated answer.

    Returns:
        The generated answer, or a short instruction message when the
        question is empty or no index has been built yet.
    """
    if not question or not question.strip():
        return "Please enter a question."
    if index is None or not corpus_chunks:
        return "Index not built yet. Upload PDFs and click **Build Index** first."

    # Embed the query and scale it to unit length, matching the indexed vectors.
    q = embedder.encode([question], convert_to_numpy=True)
    q = q / (np.linalg.norm(q, axis=1, keepdims=True) + 1e-10)

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which as a Python index would select the last chunk.
    k = max(1, min(int(top_k), len(corpus_chunks)))
    _, ids = index.search(q.astype(np.float32), k)
    retrieved = [corpus_chunks[i] for i in ids[0] if 0 <= i < len(corpus_chunks)]

    context = "\n\n".join(retrieved)
    # NOTE(review): several 600-word chunks may exceed the generator's input
    # limit; confirm how the pipeline handles over-long prompts.
    prompt = (
        "You are a helpful study assistant. Using ONLY the context, answer the question.\n"
        "If the answer isn't in the context, say you don't have enough information.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
    # Sampling was never enabled, so the former temperature=0.2 had no effect;
    # request deterministic (greedy) decoding explicitly instead.
    out = generator(prompt, max_new_tokens=int(max_new_tokens), do_sample=False)
    return out[0]["generated_text"].strip()
94
+
95
+
96
+
97
+ # ---- Gradio v5 UI (Blocks) ----
98
with gr.Blocks(title="Group 5 Study Helper (RAG)") as demo:
    gr.Markdown("# Group 5 Study Helper (RAG)\nUpload PDFs → Build Index → Ask questions.")

    # --- Step 1: upload PDFs and build the index ---
    with gr.Row():
        file_in = gr.Files(file_types=[".pdf"], label="Upload PDF files")
    with gr.Row():
        build_btn = gr.Button("Build Index", variant="primary")
        status = gr.Markdown()
        chunk_count = gr.Number(label="Chunk count", interactive=False)

    # --- Step 2: ask a question against the indexed PDFs ---
    with gr.Row():
        question = gr.Textbox(label="Your question")
    with gr.Row():
        topk = gr.Slider(1, 10, value=5, step=1, label="Top-K passages")
        max_tokens = gr.Slider(64, 512, value=256, step=16, label="Max new tokens")
    with gr.Row():
        ask_btn = gr.Button("Ask", variant="primary")
    with gr.Row():
        answer = gr.Markdown(label="Answer")

    def _build(files, progress=gr.Progress()):
        """Click handler for "Build Index".

        Gradio injects a progress tracker only into the event handler itself,
        so it is accepted here and passed on to build_index().
        """
        msg, n = build_index(files, progress)
        # build_index reports None when nothing was indexed; show 0 in the Number box.
        return msg, n or 0

    build_btn.click(_build, inputs=[file_in], outputs=[status, chunk_count])
    ask_btn.click(answer_question, inputs=[question, topk, max_tokens], outputs=[answer])

demo.launch()