chippyjolly committed · verified
Commit 66dcec5 · 1 Parent(s): a661314

Update app.py

Files changed (1):
  app.py  +213 -187
app.py CHANGED
@@ -1,3 +1,4 @@
 import os
 import gradio as gr
 from PyPDF2 import PdfReader
@@ -5,51 +6,62 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings

 from langchain.chains.retrieval_qa.base import RetrievalQA
 from langchain.prompts import PromptTemplate
 from langchain_core.language_models.llms import LLM
 from langchain_core.callbacks import CallbackManagerForLLMRun

 from typing import Optional, List, Dict, Any
 from dotenv import load_dotenv
 from groq import Groq

 import urllib.parse
 import feedparser

 from numpy import dot
 from numpy.linalg import norm

 # Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")


 # -----------------------------------------------------------
 # GROQ WRAPPER
 # -----------------------------------------------------------
 class GroqWrapper(LLM):
-    client: Any
-    model_name: str = "llama-3.3-70b-versatile"
-    temperature: float = 0.7
-
-    @property
-    def _llm_type(self) -> str:
-        return "groq"
-
-    def _call(
-        self,
-        prompt: str,
-        stop: Optional[List[str]] = None,
-        run_manager: Optional[CallbackManagerForLLMRun] = None,
-        **kwargs: Any,
-    ) -> str:
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=self.temperature,
-        )
-        return response.choices[0].message.content


 # Globals
@@ -58,40 +70,47 @@ qa_chain = None
 groq_llm = None


 # -----------------------------------------------------------
 # PROCESS PDF
 # -----------------------------------------------------------
 def upload_pdf(file):
-    global vectorstore, qa_chain, groq_llm
-
-    try:
-        # Initialize Groq LLM
-        if groq_llm is None:
-            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
-
-        # Extract text from PDF
-        text = "".join(page.extract_text() or "" for page in PdfReader(file).pages)
-        if not text.strip():
-            return "Error: No readable text found in PDF"
-
-        # Chunk the text
-        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=150,
-            separators=["\n\n", "\n", ".", "?", "!"]
-        )
-        chunks = splitter.split_text(text)
-
-        # Create Vectorstore
-        embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
-        )
-        vectorstore = FAISS.from_texts(chunks, embeddings)
-
-        # --- CUSTOM REFINE PROMPTS ---
-        initial_prompt = PromptTemplate(
-            input_variables=["context", "question"],
-            template="""
 You are an expert researcher.

 Use ONLY the given context to answer the question.
@@ -104,12 +123,13 @@ Question: {question}

 Initial Answer:
 """
-        )

-        refine_prompt = PromptTemplate(
-            input_variables=["context", "question", "existing_answer"],
-            template="""
-We have an existing answer:
 {existing_answer}

 Using the additional context below, refine the answer.
@@ -121,71 +141,87 @@ Question: {question}

 Refined Answer:
 """
-        )
-
-        # --- BUILD QA CHAIN ---
-        qa_chain = RetrievalQA.from_chain_type(
-            llm=groq_llm,
-            retriever=vectorstore.as_retriever(),
-            chain_type="refine",
-            return_source_documents=True,
-            chain_type_kwargs={
-                "question_prompt": initial_prompt,
-                "refine_prompt": refine_prompt,
-                "document_variable_name": "context"  # <-- ADD THIS LINE
-            }
         )



-        return "PDF processed successfully!"

-    except Exception as e:
-        return f"Error: {str(e)}"


 # -----------------------------------------------------------
 # QUESTION ANSWERING
 # -----------------------------------------------------------
 def ask_question(query):
-    global qa_chain

-    if qa_chain is None:
-        return "Please upload a PDF first.", ""

-    try:
-        result = qa_chain({"query": query})
-        answer = result["result"]

-        # Format sources
-        sources = result.get("source_documents", [])
-        if sources:
-            source_text = "\n\n---\n".join(
-                f"Source {i+1}:\n{doc.page_content[:500]}..."
-                for i, doc in enumerate(sources)
-            )
-        else:
-            source_text = "No sources found."

-        return answer, source_text

-    except Exception as e:
-        return f"Error: {str(e)}", ""


 # -----------------------------------------------------------
 # SUMMARIZE PDF
 # -----------------------------------------------------------
 def summarize_pdf(num_points=6):
-    global groq_llm, vectorstore
-    if vectorstore is None:
-        return "Please upload a PDF first."

-    try:
-        docs = vectorstore.similarity_search("summary", k=5)
-        context = "\n\n".join([d.page_content for d in docs])

-        prompt = f"""
 Summarize the research paper in {num_points} bullet points.
 Make it clear, meaningful, and highlight key contributions.
@@ -195,107 +231,97 @@ Content:
 Summary:
 """

-        if groq_llm is None:
-            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))

-        return groq_llm(prompt).strip()

-    except Exception as e:
-        return f"Error: {str(e)}"


 # -----------------------------------------------------------
 # FIND SIMILAR PAPERS (arXiv)
 # -----------------------------------------------------------
-def extract_title_and_abstract(text):
-    lines = text.split("\n")
-    title = lines[0].strip()

-    abstract = ""
-    for i, line in enumerate(lines):
-        if "abstract" in line.lower():
-            # Take next 8–12 lines as abstract
-            abstract = " ".join(lines[i+1:i+10])
-            break

-    return title, abstract


-def find_similar_papers():
-    global vectorstore
-
-    if vectorstore is None:
-        return "Please upload a PDF first."
-
-    try:
-        # Get full PDF text from all chunks
-        docs = vectorstore.similarity_search("", k=30)
-        full_pdf_text = " ".join(d.page_content for d in docs)
-
-        if not full_pdf_text.strip():
-            return "PDF content too small."
-
-        # ----------------------------
-        # 1️⃣ Extract title + abstract
-        # ----------------------------
-        title, abstract = extract_title_and_abstract(full_pdf_text)
-        query_text = f"{title}. {abstract}"
-
-        # ----------------------------
-        # 2️⃣ Search arXiv with a real query
-        # ----------------------------
-        encoded_query = urllib.parse.quote(query_text)
-        url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results=15"
-
-        feed = feedparser.parse(url)
-        entries = feed.entries
-
-        if not entries:
-            return "No similar papers found on arXiv."
-
-        # ----------------------------
-        # 3️⃣ Better embeddings for similarity
-        # ----------------------------
-        embedding_model = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-mpnet-base-v2"
-        )
-
-        query_emb = embedding_model.embed_query(query_text)
-
-        ranked = []
-        for entry in entries:
-            candidate_text = f"{entry.title} {entry.summary}"
-            emb = embedding_model.embed_query(candidate_text)
-
-            sim = dot(query_emb, emb) / (norm(query_emb) * norm(emb))
-
-            ranked.append({
-                "title": entry.title,
-                "summary": entry.summary.replace("\n", " ").strip(),
-                "link": entry.link,
-                "similarity": sim
-            })
-
-        # Sort by similarity
-        ranked.sort(key=lambda x: x["similarity"], reverse=True)
-
-        # ----------------------------
-        # 4️⃣ Format top 3 results
-        # ----------------------------
-        output = []
-        for p in ranked[:3]:
-            out = (
-                f"**{p['title']}**\n"
-                f"{p['summary']}\n"
-                f"🔗 {p['link']}\n"
-                f"Similarity Score: {p['similarity']:.2f}"
-            )
-            output.append(out)
-
-        return "\n\n".join(output)
-
-    except Exception as e:
-        return f"Error: {str(e)}"
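Both the removed find_similar_papers above and the rewritten version in the new revision of app.py (listed below, with added lines marked "+") rank arXiv candidates the same way: embed the paper text and each arXiv title plus abstract with a sentence-transformers model, then score every candidate with plain cosine similarity, dot(a, b) / (norm(a) * norm(b)). A minimal, self-contained sketch of just that scoring step, using toy vectors rather than real embeddings:

# Illustration only -- toy 3-d vectors stand in for sentence-transformer embeddings.
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    # 1.0 means identical direction; values near 0 mean unrelated.
    return dot(a, b) / (norm(a) * norm(b))

query_emb = [0.2, 0.8, 0.1]       # pretend embedding of the uploaded paper
candidate_emb = [0.25, 0.7, 0.0]  # pretend embedding of an arXiv title + abstract
print(f"Similarity Score: {cosine_similarity(query_emb, candidate_emb):.2f}")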
 
+
 import os
 import gradio as gr
 from PyPDF2 import PdfReader

 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings

+
 from langchain.chains.retrieval_qa.base import RetrievalQA
 from langchain.prompts import PromptTemplate
 from langchain_core.language_models.llms import LLM
 from langchain_core.callbacks import CallbackManagerForLLMRun

+
 from typing import Optional, List, Dict, Any
 from dotenv import load_dotenv
 from groq import Groq

+
 import urllib.parse
 import feedparser

+
 from numpy import dot
 from numpy.linalg import norm

+
 # Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")


+
+
 # -----------------------------------------------------------
 # GROQ WRAPPER
 # -----------------------------------------------------------
 class GroqWrapper(LLM):
+    client: Any
+    model_name: str = "llama-3.3-70b-versatile"
+    temperature: float = 0.7
+
+
+    @property
+    def _llm_type(self) -> str:
+        return "groq"
+
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> str:
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+        )
+        return response.choices[0].message.content
+
+


 # Globals
 groq_llm = None


+
+
 # -----------------------------------------------------------
 # PROCESS PDF
 # -----------------------------------------------------------
 def upload_pdf(file):
+    global vectorstore, qa_chain, groq_llm
+
+
+    try:
+        # Initialize Groq LLM
+        if groq_llm is None:
+            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
+
+
+        # Extract text from PDF
+        text = "".join(page.extract_text() or "" for page in PdfReader(file).pages)
+        if not text.strip():
+            return "Error: No readable text found in PDF"
+
+
+        # Chunk the text
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=150,
+            separators=["\n\n", "\n", ".", "?", "!"]
+        )
+        chunks = splitter.split_text(text)
+
+
+        # Create Vectorstore
+        embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
+        )
+        vectorstore = FAISS.from_texts(chunks, embeddings)
+
+
+        # --- CUSTOM REFINE PROMPTS ---
+        initial_prompt = PromptTemplate(
+            input_variables=["context", "question"],
+            template="""
 You are an expert researcher.

 Use ONLY the given context to answer the question.

 Initial Answer:
 """
+        )
+

+        refine_prompt = PromptTemplate(
+            input_variables=["context", "question", "existing_answer"],
+            template="""
+We have an existing answer:
 {existing_answer}

 Using the additional context below, refine the answer.

 Refined Answer:
 """
+        )
+
+
+        # --- BUILD QA CHAIN ---
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=groq_llm,
+            retriever=vectorstore.as_retriever(),
+            chain_type="refine",
+            return_source_documents=True,
+            chain_type_kwargs={
+                "question_prompt": initial_prompt,
+                "refine_prompt": refine_prompt,
+                "document_variable_name": "context"  # <-- ADD THIS LINE
+            }
         )




+
+
+        return "PDF processed successfully!"
+
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+


 # -----------------------------------------------------------
 # QUESTION ANSWERING
 # -----------------------------------------------------------
 def ask_question(query):
+    global qa_chain
+
+
+    if qa_chain is None:
+        return "Please upload a PDF first.", ""
+

+    try:
+        result = qa_chain({"query": query})
+        answer = result["result"]


+        # Format sources
+        sources = result.get("source_documents", [])
+        if sources:
+            source_text = "\n\n---\n".join(
+                f"Source {i+1}:\n{doc.page_content[:500]}..."
+                for i, doc in enumerate(sources)
+            )
+        else:
+            source_text = "No sources found."
+
+
+        return answer, source_text
+
+
+    except Exception as e:
+        return f"Error: {str(e)}", ""




 # -----------------------------------------------------------
 # SUMMARIZE PDF
 # -----------------------------------------------------------
 def summarize_pdf(num_points=6):
+    global groq_llm, vectorstore
+    if vectorstore is None:
+        return "Please upload a PDF first."
+

+    try:
+        docs = vectorstore.similarity_search("summary", k=5)
+        context = "\n\n".join([d.page_content for d in docs])

+
+        prompt = f"""
 Summarize the research paper in {num_points} bullet points.
 Make it clear, meaningful, and highlight key contributions.

 Summary:
 """


+        if groq_llm is None:
+            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
+
+
+        return groq_llm(prompt).strip()
+
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+



 # -----------------------------------------------------------
 # FIND SIMILAR PAPERS (arXiv)
 # -----------------------------------------------------------
+def find_similar_papers():
+    global vectorstore


+    if vectorstore is None:
+        return "Please upload a PDF first."


+    try:
+        # Get content from PDF
+        top_chunks = vectorstore.similarity_search("", k=5)
+        pdf_text = " ".join(doc.page_content for doc in top_chunks)
+
+
+        if not pdf_text.strip():
+            return "PDF content too small."
+
+
+        # Extract keywords
+        keywords = " ".join(pdf_text.split()[:20])
+        encoded = urllib.parse.quote(keywords)
+        url = f"http://export.arxiv.org/api/query?search_query=all:{encoded}&start=0&max_results=5"
+
+
+        feed = feedparser.parse(url)
+        entries = feed.entries
+
+
+        if not entries:
+            return "No arXiv results found."
+
+
+        # Embeddings for ranking
+        embedding_model = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
+        )
+        pdf_emb = embedding_model.embed_query(pdf_text)
+
+
+        results = []
+        for entry in entries:
+            txt = f"{entry.title} {entry.summary}"
+            emb = embedding_model.embed_query(txt)
+            sim = dot(pdf_emb, emb) / (norm(pdf_emb) * norm(emb))
+
+
+            results.append({
+                "title": entry.title,
+                "summary": entry.summary.replace("\n", " ").strip(),
+                "link": entry.link,
+                "similarity": sim
+            })
+
+
+        # Sort by similarity DESC
+        results.sort(key=lambda x: x["similarity"], reverse=True)
+
+
+        formatted = []
+        for paper in results[:3]:
+            formatted.append(
+                f"**{paper['title']}**\n"
+                f"{paper['summary']}\n"
+                f"🔗 {paper['link']}\n"
+                f"Similarity Score: {paper['similarity']:.2f}"
+            )
+
+
+        return "\n\n".join(formatted)
+
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+



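The visible hunks end before any interface code, but app.py imports gradio and exposes upload_pdf, ask_question, summarize_pdf, and find_similar_papers, so the file presumably finishes by wiring those callbacks into a Gradio UI. A minimal sketch of what that wiring could look like, assuming it sits after the functions above; the component labels, layout, and launch call are illustrative guesses, not part of this commit:

# Hypothetical UI wiring -- not shown in this commit's visible diff.
import gradio as gr  # already imported at the top of app.py

with gr.Blocks(title="Research Paper Assistant") as demo:
    pdf_input = gr.File(label="Upload a research paper (PDF)")
    status = gr.Textbox(label="Status")
    pdf_input.upload(upload_pdf, inputs=pdf_input, outputs=status)

    question = gr.Textbox(label="Ask a question about the paper")
    answer = gr.Textbox(label="Answer")
    sources = gr.Textbox(label="Sources")
    question.submit(ask_question, inputs=question, outputs=[answer, sources])

    summarize_btn = gr.Button("Summarize")
    summary = gr.Textbox(label="Summary")
    summarize_btn.click(summarize_pdf, inputs=None, outputs=summary)

    similar_btn = gr.Button("Find similar papers")
    similar = gr.Markdown()
    similar_btn.click(find_similar_papers, inputs=None, outputs=similar)

demo.launch()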