SamarthPujari committed on
Commit
5e6341f
·
verified ·
1 Parent(s): 5f65018

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -19
app.py CHANGED
@@ -82,56 +82,59 @@ qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
82
 
83
  @tool
84
  def document_qna_tool(pdf_path: str, question: str) -> str:
 
 
 
 
 
 
 
 
 
 
85
  import os, fitz, traceback
86
  from sentence_transformers import SentenceTransformer, util
87
  from transformers import pipeline
88
 
89
  try:
90
- # Step 0: Log input
91
- print(f"[DEBUG] Received pdf_path: {pdf_path}")
92
- print(f"[DEBUG] Received question: {question}")
93
 
94
- # Step 1: Check file exists
95
  if not os.path.exists(pdf_path):
96
- return f"[ERROR] File does not exist at {pdf_path}"
97
 
98
- # Step 2: Try opening PDF
99
  print("[DEBUG] Opening PDF...")
100
- doc = fitz.open(pdf_path)
 
 
 
101
 
102
- # Step 3: Extract text
103
- print("[DEBUG] Extracting text...")
104
  text_chunks = []
105
  for page in doc:
106
  text = page.get_text()
107
  if text.strip():
108
  text_chunks.append(text)
109
  doc.close()
110
- print(f"[DEBUG] Extracted {len(text_chunks)} chunks of text")
111
 
112
  if not text_chunks:
113
- return "[ERROR] No text found in the PDF."
 
 
114
 
115
- # Step 4: Load model
116
- print("[DEBUG] Loading embedding model...")
117
  embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
118
- print("[DEBUG] Encoding text...")
119
  embeddings = embedding_model.encode(text_chunks, convert_to_tensor=True)
120
  question_embedding = embedding_model.encode(question, convert_to_tensor=True)
121
 
122
- # Step 5: Semantic search
123
  print("[DEBUG] Performing semantic search...")
124
  scores = util.pytorch_cos_sim(question_embedding, embeddings)[0]
125
  best_match_idx = scores.argmax().item()
126
  best_context = text_chunks[best_match_idx]
127
- print(f"[DEBUG] Found best context index: {best_match_idx}")
128
 
129
- # Step 6: Answer question
130
- print("[DEBUG] Loading QA model...")
131
  qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
132
  prompt = f"Context: {best_context}\nQuestion: {question}"
133
- print(f"[DEBUG] Prompting model...")
134
  answer = qa_pipeline(prompt, max_new_tokens=100)[0]['generated_text']
 
135
  return f"Answer: {answer.strip()}"
136
 
137
  except Exception as e:
 
82
 
83
  @tool
84
  def document_qna_tool(pdf_path: str, question: str) -> str:
85
+ """
86
+ A tool that answers natural language questions about a given PDF document.
87
+
88
+ Args:
89
+ pdf_path (str): Path to the local PDF file.
90
+ question (str): Question about the content of the PDF.
91
+
92
+ Returns:
93
+ str: Answer to the question based on the content.
94
+ """
95
  import os, fitz, traceback
96
  from sentence_transformers import SentenceTransformer, util
97
  from transformers import pipeline
98
 
99
  try:
100
+ print(f"[DEBUG] PDF Path: {pdf_path}")
101
+ print(f"[DEBUG] Question: {question}")
 
102
 
 
103
  if not os.path.exists(pdf_path):
104
+ return f"[ERROR] File not found: {pdf_path}"
105
 
 
106
  print("[DEBUG] Opening PDF...")
107
+ try:
108
+ doc = fitz.open(pdf_path)
109
+ except RuntimeError as e:
110
+ return f"[ERROR] Could not open PDF. It may be corrupted or encrypted. Details: {str(e)}"
111
 
 
 
112
  text_chunks = []
113
  for page in doc:
114
  text = page.get_text()
115
  if text.strip():
116
  text_chunks.append(text)
117
  doc.close()
 
118
 
119
  if not text_chunks:
120
+ return "[ERROR] No readable text in the PDF."
121
+
122
+ print(f"[DEBUG] Extracted {len(text_chunks)} text chunks.")
123
 
 
 
124
  embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
125
  embeddings = embedding_model.encode(text_chunks, convert_to_tensor=True)
126
  question_embedding = embedding_model.encode(question, convert_to_tensor=True)
127
 
 
128
  print("[DEBUG] Performing semantic search...")
129
  scores = util.pytorch_cos_sim(question_embedding, embeddings)[0]
130
  best_match_idx = scores.argmax().item()
131
  best_context = text_chunks[best_match_idx]
 
132
 
 
 
133
  qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
134
  prompt = f"Context: {best_context}\nQuestion: {question}"
135
+ print("[DEBUG] Calling QA model...")
136
  answer = qa_pipeline(prompt, max_new_tokens=100)[0]['generated_text']
137
+
138
  return f"Answer: {answer.strip()}"
139
 
140
  except Exception as e: