SamarthPujari committed on
Commit
5f65018
·
verified ·
1 Parent(s): e58d6bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -15
app.py CHANGED
@@ -82,44 +82,60 @@ qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
82
 
83
  @tool
84
  def document_qna_tool(pdf_path: str, question: str) -> str:
85
- """
86
- A tool for answering questions based on the content of a PDF document.
87
- Args:
88
- pdf_path (str): Path to the local PDF file.
89
- question (str): A natural language question to ask about the PDF content.
90
- Returns:
91
- str: Answer to the question based on the PDF's content.
92
- """
93
  try:
 
 
 
 
 
94
  if not os.path.exists(pdf_path):
95
- return f"Error: File not found at {pdf_path}"
96
 
97
- # Step 1: Extract text from PDF
 
98
  doc = fitz.open(pdf_path)
 
 
 
99
  text_chunks = []
100
  for page in doc:
101
  text = page.get_text()
102
  if text.strip():
103
  text_chunks.append(text)
104
  doc.close()
 
105
 
106
  if not text_chunks:
107
- return "No text found in the PDF."
108
 
109
- # Step 2: Semantic search
 
 
 
110
  embeddings = embedding_model.encode(text_chunks, convert_to_tensor=True)
111
  question_embedding = embedding_model.encode(question, convert_to_tensor=True)
 
 
 
112
  scores = util.pytorch_cos_sim(question_embedding, embeddings)[0]
113
- best_match_idx = scores.argmax()
114
  best_context = text_chunks[best_match_idx]
 
115
 
116
- # Step 3: Answer question
 
 
117
  prompt = f"Context: {best_context}\nQuestion: {question}"
 
118
  answer = qa_pipeline(prompt, max_new_tokens=100)[0]['generated_text']
119
  return f"Answer: {answer.strip()}"
120
 
121
  except Exception as e:
122
- return f"Error processing document QnA: {type(e).__name__}: {str(e)}\n{traceback.format_exc()}"
123
 
124
  # -------------------- Other Components --------------------
125
  final_answer = FinalAnswerTool()
 
82
 
83
  @tool
84
  def document_qna_tool(pdf_path: str, question: str) -> str:
85
+ import os, fitz, traceback
86
+ from sentence_transformers import SentenceTransformer, util
87
+ from transformers import pipeline
88
+
 
 
 
 
89
  try:
90
+ # Step 0: Log input
91
+ print(f"[DEBUG] Received pdf_path: {pdf_path}")
92
+ print(f"[DEBUG] Received question: {question}")
93
+
94
+ # Step 1: Check file exists
95
  if not os.path.exists(pdf_path):
96
+ return f"[ERROR] File does not exist at {pdf_path}"
97
 
98
+ # Step 2: Try opening PDF
99
+ print("[DEBUG] Opening PDF...")
100
  doc = fitz.open(pdf_path)
101
+
102
+ # Step 3: Extract text
103
+ print("[DEBUG] Extracting text...")
104
  text_chunks = []
105
  for page in doc:
106
  text = page.get_text()
107
  if text.strip():
108
  text_chunks.append(text)
109
  doc.close()
110
+ print(f"[DEBUG] Extracted {len(text_chunks)} chunks of text")
111
 
112
  if not text_chunks:
113
+ return "[ERROR] No text found in the PDF."
114
 
115
+ # Step 4: Load model
116
+ print("[DEBUG] Loading embedding model...")
117
+ embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
118
+ print("[DEBUG] Encoding text...")
119
  embeddings = embedding_model.encode(text_chunks, convert_to_tensor=True)
120
  question_embedding = embedding_model.encode(question, convert_to_tensor=True)
121
+
122
+ # Step 5: Semantic search
123
+ print("[DEBUG] Performing semantic search...")
124
  scores = util.pytorch_cos_sim(question_embedding, embeddings)[0]
125
+ best_match_idx = scores.argmax().item()
126
  best_context = text_chunks[best_match_idx]
127
+ print(f"[DEBUG] Found best context index: {best_match_idx}")
128
 
129
+ # Step 6: Answer question
130
+ print("[DEBUG] Loading QA model...")
131
+ qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
132
  prompt = f"Context: {best_context}\nQuestion: {question}"
133
+ print(f"[DEBUG] Prompting model...")
134
  answer = qa_pipeline(prompt, max_new_tokens=100)[0]['generated_text']
135
  return f"Answer: {answer.strip()}"
136
 
137
  except Exception as e:
138
+ return f"[EXCEPTION] {type(e).__name__}: {str(e)}\n{traceback.format_exc()}"
139
 
140
  # -------------------- Other Components --------------------
141
  final_answer = FinalAnswerTool()