Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
import torch
|
| 4 |
from transformers import pipeline
|
| 5 |
-
import time, logging, re
|
| 6 |
import matplotlib
|
| 7 |
matplotlib.use('Agg')
|
| 8 |
import matplotlib.pyplot as plt
|
|
@@ -62,24 +62,20 @@ def summarize_file(file_bytes):
|
|
| 62 |
if not text.strip():
|
| 63 |
return "❌ No text found", None
|
| 64 |
|
| 65 |
-
text = text[:
|
| 66 |
-
chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
|
| 67 |
summaries = []
|
| 68 |
|
| 69 |
for i, chunk in enumerate(chunks):
|
| 70 |
chunk_start = time.time()
|
| 71 |
chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
|
| 72 |
|
| 73 |
-
if time.time() - start > 20:
|
| 74 |
-
summaries.append("⚠️ Stopped early")
|
| 75 |
-
break
|
| 76 |
-
|
| 77 |
if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
|
| 78 |
summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
|
| 79 |
chunk_result['status'] = 'skipped'
|
| 80 |
else:
|
| 81 |
try:
|
| 82 |
-
summary = summarizer(chunk, max_length=
|
| 83 |
summaries.append(f"### Chunk {i+1}\n{summary}")
|
| 84 |
chunk_result['status'] = 'summarized'
|
| 85 |
except Exception as e:
|
|
@@ -98,12 +94,16 @@ def summarize_file(file_bytes):
|
|
| 98 |
image = visualize_chunk_status(chunk_info)
|
| 99 |
return final_summary, image
|
| 100 |
|
| 101 |
-
def find_relevant_passages(text, question, num_passages=
|
| 102 |
passages = re.split(r'(?<=[.?!])\s+', text)
|
| 103 |
scored = []
|
|
|
|
| 104 |
for passage in passages:
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
| 107 |
scored.sort(reverse=True)
|
| 108 |
best_passages = " ".join([p for _, p in scored[:num_passages]])
|
| 109 |
return best_passages
|
|
@@ -114,7 +114,7 @@ def answer_question(file_bytes, question):
|
|
| 114 |
text = "".join(page.get_text("text") for page in doc)
|
| 115 |
text = re.sub(r"\s+", " ", text).strip()
|
| 116 |
text = "".join(c for c in text if ord(c) < 128)
|
| 117 |
-
context = text[:
|
| 118 |
except Exception as e:
|
| 119 |
return f"❌ Text extraction failed: {str(e)}"
|
| 120 |
|
|
@@ -161,3 +161,4 @@ if __name__ == "__main__":
|
|
| 161 |
).launch(server_port=7860)
|
| 162 |
except Exception as e:
|
| 163 |
print(f"❌ Gradio launch failed: {str(e)}")
|
|
|
|
|
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
import torch
|
| 4 |
from transformers import pipeline
|
| 5 |
+
import time, logging, re, difflib
|
| 6 |
import matplotlib
|
| 7 |
matplotlib.use('Agg')
|
| 8 |
import matplotlib.pyplot as plt
|
|
|
|
| 62 |
if not text.strip():
|
| 63 |
return "❌ No text found", None
|
| 64 |
|
| 65 |
+
text = text[:300000] # allow full but reasonable size
|
| 66 |
+
chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
|
| 67 |
summaries = []
|
| 68 |
|
| 69 |
for i, chunk in enumerate(chunks):
|
| 70 |
chunk_start = time.time()
|
| 71 |
chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
|
| 74 |
summaries.append(f"### Chunk {i+1}: Skipped (equation-heavy)")
|
| 75 |
chunk_result['status'] = 'skipped'
|
| 76 |
else:
|
| 77 |
try:
|
| 78 |
+
summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
|
| 79 |
summaries.append(f"### Chunk {i+1}\n{summary}")
|
| 80 |
chunk_result['status'] = 'summarized'
|
| 81 |
except Exception as e:
|
|
|
|
| 94 |
image = visualize_chunk_status(chunk_info)
|
| 95 |
return final_summary, image
|
| 96 |
|
| 97 |
+
def find_relevant_passages(text, question, num_passages=5):
    """Return the passages of *text* most relevant to *question*, joined by spaces.

    The text is split into sentence-like passages after '.', '?' or '!'.
    Each passage is scored by the number of distinct words it shares with
    the question (case-insensitive, punctuation ignored). A passage with
    no shared words falls back to a difflib similarity ratio in [0, 1],
    so it can never outrank a passage with a real keyword hit.

    Args:
        text: Source text to search.
        question: Question whose words drive the ranking.
        num_passages: Maximum number of passages to return; values below
            zero are treated as zero.

    Returns:
        The top-scoring passages joined with single spaces, or an empty
        string when the text contains no non-blank passage.
    """
    # Drop blank fragments so they are never scored or returned.
    passages = [p for p in re.split(r'(?<=[.?!])\s+', text) if p.strip()]

    question_lower = question.lower()
    # \w+ strips punctuation, so "cat?" in the question matches "cat." in a passage.
    question_tokens = set(re.findall(r"\w+", question_lower))

    scored = []
    for passage in passages:
        passage_lower = passage.lower()
        passage_tokens = set(re.findall(r"\w+", passage_lower))
        match_score = len(question_tokens & passage_tokens)
        if match_score == 0:
            # Fuzzy fallback, compared case-insensitively like the keyword path.
            match_score = difflib.SequenceMatcher(
                None, question_lower, passage_lower
            ).ratio()
        scored.append((match_score, passage))

    # Sort on the score only: the sort is stable, so equally scored passages
    # keep their document order instead of being ordered by their text.
    scored.sort(key=lambda item: item[0], reverse=True)
    best_passages = " ".join(p for _, p in scored[:max(num_passages, 0)])
    return best_passages
|
|
|
|
| 114 |
text = "".join(page.get_text("text") for page in doc)
|
| 115 |
text = re.sub(r"\s+", " ", text).strip()
|
| 116 |
text = "".join(c for c in text if ord(c) < 128)
|
| 117 |
+
context = text[:300000]
|
| 118 |
except Exception as e:
|
| 119 |
return f"❌ Text extraction failed: {str(e)}"
|
| 120 |
|
|
|
|
| 161 |
).launch(server_port=7860)
|
| 162 |
except Exception as e:
|
| 163 |
print(f"❌ Gradio launch failed: {str(e)}")
|
| 164 |
+
|