Update app.py
app.py CHANGED
@@ -82,56 +82,59 @@ qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
 @tool
 def document_qna_tool(pdf_path: str, question: str) -> str:
+    """
+    A tool that answers natural language questions about a given PDF document.
+
+    Args:
+        pdf_path (str): Path to the local PDF file.
+        question (str): Question about the content of the PDF.
+
+    Returns:
+        str: Answer to the question based on the content.
+    """
     import os, fitz, traceback
     from sentence_transformers import SentenceTransformer, util
     from transformers import pipeline
 
     try:
-        print(f"[DEBUG]
-        print(f"[DEBUG] Received question: {question}")
+        print(f"[DEBUG] PDF Path: {pdf_path}")
+        print(f"[DEBUG] Question: {question}")
 
-        # Step 1: Check file exists
         if not os.path.exists(pdf_path):
-            return f"[ERROR] File
+            return f"[ERROR] File not found: {pdf_path}"
 
-        # Step 2: Try opening PDF
         print("[DEBUG] Opening PDF...")
-        doc = fitz.open(pdf_path)
+        try:
+            doc = fitz.open(pdf_path)
+        except RuntimeError as e:
+            return f"[ERROR] Could not open PDF. It may be corrupted or encrypted. Details: {str(e)}"
 
-        # Step 3: Extract text
-        print("[DEBUG] Extracting text...")
         text_chunks = []
         for page in doc:
             text = page.get_text()
             if text.strip():
                 text_chunks.append(text)
         doc.close()
-        print(f"[DEBUG] Extracted {len(text_chunks)} chunks of text")
 
         if not text_chunks:
-            return "[ERROR] No text
+            return "[ERROR] No readable text in the PDF."
+
+        print(f"[DEBUG] Extracted {len(text_chunks)} text chunks.")
 
-        # Step 4: Load model
-        print("[DEBUG] Loading embedding model...")
         embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-        print("[DEBUG] Encoding text...")
         embeddings = embedding_model.encode(text_chunks, convert_to_tensor=True)
         question_embedding = embedding_model.encode(question, convert_to_tensor=True)
 
-        # Step 5: Semantic search
         print("[DEBUG] Performing semantic search...")
         scores = util.pytorch_cos_sim(question_embedding, embeddings)[0]
         best_match_idx = scores.argmax().item()
         best_context = text_chunks[best_match_idx]
-        print(f"[DEBUG] Found best context index: {best_match_idx}")
 
-        # Step 6: Answer question
-        print("[DEBUG] Loading QA model...")
         qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
         prompt = f"Context: {best_context}\nQuestion: {question}"
-        print(
+        print("[DEBUG] Calling QA model...")
         answer = qa_pipeline(prompt, max_new_tokens=100)[0]['generated_text']
+
         return f"Answer: {answer.strip()}"
 
     except Exception as e:
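For orientation, a minimal usage sketch of the updated tool. It assumes `@tool` is the smolagents decorator (whose wrapped tools remain directly callable) and uses a hypothetical file path; neither detail is confirmed by the diff itself.

    # Hypothetical usage -- assumes `@tool` is smolagents' decorator, whose
    # wrapped Tool objects stay callable, and that sample.pdf exists locally.
    result = document_qna_tool(
        pdf_path="sample.pdf",  # hypothetical local file
        question="What is the main topic of this document?",
    )
    print(result)  # "Answer: ..." on success, an "[ERROR] ..." string otherwise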
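A possible follow-up, sketched here rather than taken from the commit: the tool re-instantiates both the embedding model and the QA pipeline on every call (the hunk context even shows a module-level qa_pipeline at line 82 that the local one shadows), so each question pays the full model-load cost. Caching the instances once per process, e.g. with functools.lru_cache, would avoid that; the helper names below are invented for illustration.

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def _get_embedder():  # hypothetical helper, not in the commit
        from sentence_transformers import SentenceTransformer
        return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    @lru_cache(maxsize=1)
    def _get_qa_pipeline():  # hypothetical helper, not in the commit
        from transformers import pipeline
        return pipeline("text2text-generation", model="google/flan-t5-base")

    # Inside document_qna_tool, these cached instances would replace the
    # per-call constructions:
    #     embedding_model = _get_embedder()
    #     qa_pipeline = _get_qa_pipeline()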