simran40 committed on
Commit
ead3409
·
verified ·
1 Parent(s): 4e13ba0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -41
app.py CHANGED
@@ -1,59 +1,95 @@
1
  import gradio as gr
2
- import fitz
3
  import re
4
  import faiss
5
  import torch
6
  import numpy as np
 
7
  from sentence_transformers import SentenceTransformer
8
  from transformers import AutoTokenizer, AutoModelForCausalLM
9
 
10
- # -------- Load Models --------
 
 
 
 
 
11
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
12
 
13
- llm_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
14
- tokenizer = AutoTokenizer.from_pretrained(llm_name)
 
 
15
  llm = AutoModelForCausalLM.from_pretrained(
16
- llm_name,
17
  torch_dtype=torch.float32
18
  )
19
 
20
- # -------- Helper Functions --------
21
- def extract_text(pdf_file):
22
- doc = fitz.open(pdf_file)
 
 
 
 
 
 
23
  text = ""
24
  for page in doc:
25
  text += page.get_text()
26
  return text
27
 
 
28
  def clean_text(text):
29
- return re.sub(r"\s+", " ", text)
 
30
 
31
  def chunk_text(text, chunk_size=500, overlap=50):
32
  chunks = []
33
  start = 0
34
- while start < len(text):
 
 
35
  end = start + chunk_size
36
  chunks.append(text[start:end])
37
  start = end - overlap
 
38
  return chunks
39
 
40
- def build_vector_db(chunks):
 
 
 
 
 
41
  embeddings = embedding_model.encode(chunks)
42
  embeddings = np.array(embeddings).astype("float32")
 
43
  index = faiss.IndexFlatL2(embeddings.shape[1])
44
  index.add(embeddings)
 
45
  return index, chunks
46
 
47
- def retrieve_context(query, index, chunks, k=3):
48
- q_emb = embedding_model.encode([query]).astype("float32")
49
- _, indices = index.search(q_emb, k)
 
 
50
  return [chunks[i] for i in indices[0]]
51
 
 
 
 
 
 
52
  def generate_answer(question, context_chunks):
53
  context = "\n\n".join(context_chunks)
 
54
  prompt = f"""
55
- Answer the question using ONLY the context below.
56
- If not found, say "Information not found in the document."
 
 
57
 
58
  Context:
59
  {context}
@@ -63,31 +99,101 @@ Question:
63
 
64
  Answer:
65
  """
 
66
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
 
67
  with torch.no_grad():
68
- output = llm.generate(**inputs, max_new_tokens=200)
69
- response = tokenizer.decode(output[0], skip_special_tokens=True)
70
- return response.split("Answer:")[-1].strip()
71
-
72
- # -------- Main Pipeline --------
73
- def pdf_chat(pdf, question):
74
- text = extract_text(pdf.name)
75
- text = clean_text(text)
76
- chunks = chunk_text(text)
77
- index, chunks = build_vector_db(chunks)
78
- context = retrieve_context(question, index, chunks)
79
- return generate_answer(question, context)
80
-
81
- # -------- Gradio UI --------
82
- interface = gr.Interface(
83
- fn=pdf_chat,
84
- inputs=[
85
- gr.File(label="Upload PDF"),
86
- gr.Textbox(label="Ask a question")
87
- ],
88
- outputs=gr.Textbox(label="Answer"),
89
- title="📄 PDF RAG Chatbot (Open-Source AI)",
90
- description="Upload a PDF and ask questions. Runs on free CPU using Hugging Face open-source models."
91
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- interface.launch()
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
  import re
4
  import faiss
5
  import torch
6
  import numpy as np
7
+
8
  from sentence_transformers import SentenceTransformer
9
  from transformers import AutoTokenizer, AutoModelForCausalLM
10
 
11
+
12
+ # ===============================
13
+ # MODEL LOADING (ONCE)
14
+ # ===============================
15
+
16
+ # Embedding model (lightweight & fast)
17
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
18
 
19
+ # Open-source LLM (CPU friendly)
20
+ LLM_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
21
+
22
+ tokenizer = AutoTokenizer.from_pretrained(LLM_NAME)
23
  llm = AutoModelForCausalLM.from_pretrained(
24
+ LLM_NAME,
25
  torch_dtype=torch.float32
26
  )
27
 
28
+ llm.eval()
29
+
30
+
31
+ # ===============================
32
+ # PDF PROCESSING FUNCTIONS
33
+ # ===============================
34
+
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even when a page fails to parse; the app opens one document
    # per request, so unclosed handles would otherwise accumulate.
    with fitz.open(pdf_path) as doc:
        # join() avoids re-copying the growing string once per page.
        return "".join(page.get_text() for page in doc)
41
 
42
+
def clean_text(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
45
+
46
 
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of substrings in document order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Either case means the window would
            never advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once a window reaches the end of the text; any further window
        # would only repeat characters already in this chunk.
        if end >= text_length:
            break
    return chunks
58
 
59
+
60
+ # ===============================
61
+ # VECTOR DATABASE (FAISS)
62
+ # ===============================
63
+
def build_faiss_index(chunks):
    """Embed ``chunks`` and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks to embed with the module-level
            ``embedding_model``.

    Returns:
        A ``(index, chunks)`` tuple: the populated ``faiss.IndexFlatL2`` and
        the same chunk list that was passed in, so positions in the index
        line up with positions in the list.

    Raises:
        ValueError: If ``chunks`` is empty. There is then no embedding matrix
            to read the vector dimension from.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty chunk list")

    # The index is fed float32 vectors, matching the original conversion.
    embeddings = np.asarray(embedding_model.encode(chunks), dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    return index, chunks
72
 
73
+
def retrieve_relevant_chunks(query, index, chunks, top_k=3):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: The user's question.
        index: FAISS index whose row ``i`` corresponds to ``chunks[i]``.
        chunks: The chunk list the index was built from.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order FAISS ranked them. The list is
        empty when ``chunks`` is empty or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS fills the
    # missing slots with -1, which Python would read as "the last chunk".
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = embedding_model.encode([query]).astype("float32")
    _, indices = index.search(query_embedding, k)

    # Keep only ids that address a real chunk, in case padding still occurs.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
79
 
80
+
81
+ # ===============================
82
+ # ANSWER GENERATION (LLM)
83
+ # ===============================
84
+
85
  def generate_answer(question, context_chunks):
86
  context = "\n\n".join(context_chunks)
87
+
88
  prompt = f"""
89
+ You are an AI assistant.
90
+ Answer the question strictly using the given context.
91
+ If the answer is not found, reply:
92
+ "Information not found in the document."
93
 
94
  Context:
95
  {context}
 
99
 
100
  Answer:
101
  """
102
+
103
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
104
+
105
  with torch.no_grad():
106
+ output = llm.generate(
107
+ **inputs,
108
+ max_new_tokens=200,
109
+ temperature=0.2
110
+ )
111
+
112
+ decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
113
+ return decoded_output.split("Answer:")[-1].strip()
114
+
115
+
116
+ # ===============================
117
+ # MAIN RAG PIPELINE
118
+ # ===============================
119
+
120
+ def pdf_rag_chat(pdf_file, question):
121
+ if pdf_file is None or question.strip() == "":
122
+ return "Please upload a PDF and enter a question."
123
+
124
+ # 1. Extract & preprocess text
125
+ raw_text = extract_text_from_pdf(pdf_file.name)
126
+ cleaned_text = clean_text(raw_text)
127
+
128
+ # 2. Chunking
129
+ chunks = chunk_text(cleaned_text)
130
+
131
+ # 3. Vector DB
132
+ index, chunks = build_faiss_index(chunks)
133
+
134
+ # 4. Retrieval
135
+ relevant_chunks = retrieve_relevant_chunks(question, index, chunks)
136
+
137
+ # 5. LLM Answer
138
+ return generate_answer(question, relevant_chunks)
139
+
140
+
141
+ # ===============================
142
+ # GRADIO UI (PRODUCTION READY)
143
+ # ===============================
144
+
145
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
146
+
147
+ gr.Markdown("""
148
+ # 📄 PDF RAG Chatbot (Open-Source AI)
149
+
150
+ Upload a **PDF document** and ask questions based **only on its content**.
151
+ This system uses a **Retrieval Augmented Generation (RAG)** architecture with
152
+ **open-source Hugging Face models**, running entirely on **free CPU**.
153
+
154
+ ---
155
+ """)
156
+
157
+ with gr.Row():
158
+ with gr.Column(scale=1):
159
+ pdf_input = gr.File(
160
+ label="📤 Upload PDF",
161
+ file_types=[".pdf"],
162
+ file_count="single"
163
+ )
164
+
165
+ question_input = gr.Textbox(
166
+ label="❓ Ask your question",
167
+ placeholder="e.g. What is the objective of the project?",
168
+ lines=2
169
+ )
170
+
171
+ submit_btn = gr.Button("🔍 Get Answer", variant="primary")
172
+
173
+ with gr.Column(scale=2):
174
+ answer_output = gr.Textbox(
175
+ label="📌 Answer",
176
+ lines=10,
177
+ show_copy_button=True
178
+ )
179
+
180
+ submit_btn.click(
181
+ fn=pdf_rag_chat,
182
+ inputs=[pdf_input, question_input],
183
+ outputs=answer_output
184
+ )
185
+
186
+ gr.Markdown("""
187
+ ---
188
+ ### ⚙️ System Information
189
+ - **LLM:** TinyLlama (Open-Source, Hugging Face)
190
+ - **Embeddings:** Sentence Transformers
191
+ - **Vector Store:** FAISS
192
+ - **Deployment:** Hugging Face Spaces (Free CPU)
193
+
194
+ ---
195
+ © **Simranpreet Kaur**
196
+ **NIELIT Ropar | AIML Six Months Training | 2026**
197
+ """)
198
 
199
+ demo.launch()