simran40 committed
Commit 4e13ba0 · verified · 1 Parent(s): ca066e1

Create app.py

Files changed (1)
app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
import gradio as gr
import fitz  # PyMuPDF
import re
import faiss
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# -------- Load Models --------
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

llm_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(llm_name)
llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    torch_dtype=torch.float32  # float32 keeps the model runnable on CPU
)

# -------- Helper Functions --------
def extract_text(pdf_file):
    """Extract raw text from every page of a PDF."""
    doc = fitz.open(pdf_file)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def clean_text(text):
    """Collapse all runs of whitespace into single spaces."""
    return re.sub(r"\s+", " ", text)

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into fixed-size character chunks that overlap slightly."""
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap  # step back so adjacent chunks share context
    return chunks

def build_vector_db(chunks):
    """Embed the chunks and store them in a FAISS L2 index."""
    embeddings = embedding_model.encode(chunks)
    embeddings = np.array(embeddings).astype("float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks

def retrieve_context(query, index, chunks, k=3):
    """Return the k chunks whose embeddings are closest to the query."""
    q_emb = embedding_model.encode([query]).astype("float32")
    _, indices = index.search(q_emb, k)
    # FAISS pads the result with -1 when fewer than k vectors are indexed.
    return [chunks[i] for i in indices[0] if i != -1]

def generate_answer(question, context_chunks):
    """Prompt the LLM with the retrieved context and return its answer."""
    context = "\n\n".join(context_chunks)
    prompt = f"""
Answer the question using ONLY the context below.
If not found, say "Information not found in the document."

Context:
{context}

Question:
{question}

Answer:
"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = llm.generate(**inputs, max_new_tokens=200)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded text includes the prompt, so keep only what follows "Answer:".
    return response.split("Answer:")[-1].strip()

# -------- Main Pipeline --------
def pdf_chat(pdf, question):
    # gr.File may hand back a path string or a tempfile-like object
    # depending on the Gradio version; handle both.
    pdf_path = pdf.name if hasattr(pdf, "name") else pdf
    text = extract_text(pdf_path)
    text = clean_text(text)
    chunks = chunk_text(text)
    index, chunks = build_vector_db(chunks)
    context = retrieve_context(question, index, chunks)
    return generate_answer(question, context)

# -------- Gradio UI --------
interface = gr.Interface(
    fn=pdf_chat,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a question")
    ],
    outputs=gr.Textbox(label="Answer"),
    title="📄 PDF RAG Chatbot (Open-Source AI)",
    description="Upload a PDF and ask questions. Runs on free CPU using Hugging Face open-source models."
)

interface.launch()
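
A quick way to sanity-check the retrieval pipeline without launching the UI (a sketch, not part of the commit: "sample.pdf" and the question are placeholder values, and it assumes the functions above are already in scope):

    # Exercise the pipeline directly, bypassing Gradio.
    # "sample.pdf" is a hypothetical local file.
    text = clean_text(extract_text("sample.pdf"))
    chunks = chunk_text(text)
    index, chunks = build_vector_db(chunks)
    question = "What is this document about?"
    context = retrieve_context(question, index, chunks)
    print(generate_answer(question, context))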