Spaces:

tejovanth
/

example_five

Sleeping

App Files Community

tejovanth committed on Apr 28, 2025

Commit

9b9ef33

verified ·

1 Parent(s): 18c46de

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -12

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import gradio as gr
 import fitz  # PyMuPDF
 import torch
 from transformers import pipeline
-import time, logging, re
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
@@ -62,24 +62,20 @@ def summarize_file(file_bytes):
     if not text.strip():
         return "❌ No text found", None
-    text = text[:200000]  # reduced slightly for speed
-    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)][:5]  # smaller + more chunks
     summaries = []
     for i, chunk in enumerate(chunks):
         chunk_start = time.time()
         chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
-        if time.time() - start > 20:
-            summaries.append("⚠️ Stopped early")
-            break
         if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
             summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
             chunk_result['status'] = 'skipped'
         else:
             try:
-                summary = summarizer(chunk, max_length=50, min_length=10, do_sample=False)[0]['summary_text']
                 summaries.append(f"### Chunk {i+1}\n{summary}")
                 chunk_result['status'] = 'summarized'
             except Exception as e:
@@ -98,12 +94,16 @@ def summarize_file(file_bytes):
     image = visualize_chunk_status(chunk_info)
     return final_summary, image
-def find_relevant_passages(text, question, num_passages=3):
     passages = re.split(r'(?<=[.?!])\s+', text)
     scored = []
     for passage in passages:
-        score = sum(1 for word in question.lower().split() if word in passage.lower())
-        scored.append((score, passage))
     scored.sort(reverse=True)
     best_passages = " ".join([p for _, p in scored[:num_passages]])
     return best_passages
@@ -114,7 +114,7 @@ def answer_question(file_bytes, question):
         text = "".join(page.get_text("text") for page in doc)
         text = re.sub(r"\s+", " ", text).strip()
         text = "".join(c for c in text if ord(c) < 128)
-        context = text[:200000]
     except Exception as e:
         return f"❌ Text extraction failed: {str(e)}"
@@ -161,3 +161,4 @@ if __name__ == "__main__":
         ).launch(server_port=7860)
     except Exception as e:
         print(f"❌ Gradio launch failed: {str(e)}")

 import fitz  # PyMuPDF
 import torch
 from transformers import pipeline
+import time, logging, re, difflib
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
     if not text.strip():
         return "❌ No text found", None
+    text = text[:300000]  # allow full but reasonable size
+    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
     summaries = []
     for i, chunk in enumerate(chunks):
         chunk_start = time.time()
         chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
         if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
             summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
             chunk_result['status'] = 'skipped'
         else:
             try:
+                summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
                 summaries.append(f"### Chunk {i+1}\n{summary}")
                 chunk_result['status'] = 'summarized'
             except Exception as e:
     image = visualize_chunk_status(chunk_info)
     return final_summary, image
+def find_relevant_passages(text, question, num_passages=5):
     passages = re.split(r'(?<=[.?!])\s+', text)
     scored = []
+    question_tokens = set(question.lower().split())
     for passage in passages:
+        passage_tokens = set(passage.lower().split())
+        match_score = len(question_tokens.intersection(passage_tokens))
+        if match_score == 0:
+            match_score = difflib.SequenceMatcher(None, question, passage).ratio()
+        scored.append((match_score, passage))
     scored.sort(reverse=True)
     best_passages = " ".join([p for _, p in scored[:num_passages]])
     return best_passages
         text = "".join(page.get_text("text") for page in doc)
         text = re.sub(r"\s+", " ", text).strip()
         text = "".join(c for c in text if ord(c) < 128)
+        context = text[:300000]
     except Exception as e:
         return f"❌ Text extraction failed: {str(e)}"
         ).launch(server_port=7860)
     except Exception as e:
         print(f"❌ Gradio launch failed: {str(e)}")