Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
import fitz
|
| 3 |
import torch
|
| 4 |
from transformers import pipeline
|
| 5 |
-
import time, logging
|
| 6 |
|
| 7 |
logging.basicConfig(level=logging.ERROR)
|
| 8 |
device = -1 # CPU-only
|
|
@@ -18,7 +18,14 @@ def summarize_file(file_bytes):
|
|
| 18 |
start = time.time()
|
| 19 |
print(f"File type: {type(file_bytes)}")
|
| 20 |
try:
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
except Exception as e:
|
| 23 |
return f"❌ Text extraction failed: {str(e)}"
|
| 24 |
if not text.strip(): return "❌ No text found"
|
|
@@ -28,9 +35,12 @@ def summarize_file(file_bytes):
|
|
| 28 |
if not chunks: return "❌ No chunks to summarize"
|
| 29 |
summaries = []
|
| 30 |
for i, chunk in enumerate(chunks):
|
| 31 |
-
if time.time() - start >
|
| 32 |
summaries.append("⚠️ Stopped early")
|
| 33 |
break
|
|
|
|
|
|
|
|
|
|
| 34 |
try:
|
| 35 |
summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
|
| 36 |
summaries.append(f"**Chunk {i+1}**:\n{summary}")
|
|
|
|
| 2 |
import fitz
|
| 3 |
import torch
|
| 4 |
from transformers import pipeline
|
| 5 |
+
import time, logging, re
|
| 6 |
|
| 7 |
logging.basicConfig(level=logging.ERROR)
|
| 8 |
device = -1 # CPU-only
|
|
|
|
| 18 |
start = time.time()
|
| 19 |
print(f"File type: {type(file_bytes)}")
|
| 20 |
try:
|
| 21 |
+
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 22 |
+
text = "".join(page.get_text("text") for page in doc)
|
| 23 |
+
# Clean extracted text: unwrap $...$ math, spell out \cap, collapse whitespace, drop non-ASCII characters
|
| 24 |
+
text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text) # Strip $...$
|
| 25 |
+
text = re.sub(r"\\cap", "intersection", text) # Replace the LaTeX \cap command with the word "intersection" (a literal ∩ character is not matched here)
|
| 26 |
+
text = re.sub(r"\s+", " ", text).strip() # Normalize whitespace
|
| 27 |
+
text = "".join(c for c in text if ord(c) < 128) # ASCII only
|
| 28 |
+
print(f"Extracted chars: {len(text)}")
|
| 29 |
except Exception as e:
|
| 30 |
return f"❌ Text extraction failed: {str(e)}"
|
| 31 |
if not text.strip(): return "❌ No text found"
|
|
|
|
| 35 |
if not chunks: return "❌ No chunks to summarize"
|
| 36 |
summaries = []
|
| 37 |
for i, chunk in enumerate(chunks):
|
| 38 |
+
if time.time() - start > 20:
|
| 39 |
summaries.append("⚠️ Stopped early")
|
| 40 |
break
|
| 41 |
+
if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5: # Skip equation-heavy chunks
|
| 42 |
+
summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
|
| 43 |
+
continue
|
| 44 |
try:
|
| 45 |
summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
|
| 46 |
summaries.append(f"**Chunk {i+1}**:\n{summary}")
|