exampletwo / app.py
tejovanth's picture
Update app.py
b26f983 verified
import gradio as gr
import fitz
import torch
from transformers import pipeline
import time, logging, re
# Module-level setup: configure logging and load the summarization model once
# at import time so every request reuses the same pipeline object.
logging.basicConfig(level=logging.ERROR)  # only ERROR and above are emitted

device = -1  # CPU-only
print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")

try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as e:
    # Without a model the app cannot do anything useful, so stop with a
    # non-zero status. SystemExit is raised directly because the bare
    # ``exit()`` helper is injected by the ``site`` module for interactive
    # use and is not guaranteed to exist (e.g. ``python -S``, frozen apps).
    print(f"❌ Model loading failed: {e}")
    raise SystemExit(1) from e
# Tunables for summarize_file.
_MAX_CHARS = 300_000     # hard cap on the amount of cleaned text that is summarized
_CHUNK_CHARS = 2_000     # characters sent to the model per request
_TIME_BUDGET_S = 20      # no new chunk is started once this many seconds have elapsed
_MAX_SYMBOL_RATIO = 0.5  # chunks with a higher share of non-alphanumerics are skipped

# Compiled once; these run over the whole document text.
_INLINE_MATH_RE = re.compile(r"\$\s*([^$]+)\s*\$")
# The lookahead keeps longer commands such as \caption or \capacity intact.
_CAP_RE = re.compile(r"\\cap(?![A-Za-z])")
_WHITESPACE_RE = re.compile(r"\s+")


def _extract_pdf_text(file_bytes):
    """Return the plain text of every page of the PDF held in *file_bytes*."""
    # The context manager closes the document even if reading a page raises.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        return "".join(page.get_text("text") for page in doc)


def _clean_text(text):
    """Strip inline-math delimiters, spell out \\cap, and reduce to single-spaced ASCII."""
    text = _INLINE_MATH_RE.sub(r"\1", text)  # $...$ -> contents
    # Padded with spaces so the word never fuses with its operands; the
    # whitespace pass below collapses any surplus.
    text = _CAP_RE.sub(" intersection ", text)
    # First pass turns every whitespace run (including non-ASCII spaces such
    # as NBSP) into one ASCII space *before* non-ASCII characters are dropped,
    # so words separated only by such a space are not merged.
    text = _WHITESPACE_RE.sub(" ", text)
    text = text.encode("ascii", "ignore").decode("ascii")
    # Dropping a non-ASCII character that sat between two spaces leaves a
    # double space behind, hence the second pass.
    return _WHITESPACE_RE.sub(" ", text).strip()


def _is_equation_heavy(chunk):
    """Return True when more than ``_MAX_SYMBOL_RATIO`` of *chunk* is non-alphanumeric."""
    symbols = sum(1 for c in chunk if not c.isalnum())
    return symbols / len(chunk) > _MAX_SYMBOL_RATIO


def summarize_file(file_bytes):
    """Summarize an uploaded PDF chunk by chunk and return a text report.

    The PDF text is extracted, cleaned, truncated to ``_MAX_CHARS`` characters
    and split into ``_CHUNK_CHARS``-sized pieces. Each piece is passed to the
    module-level ``summarizer`` until ``_TIME_BUDGET_S`` seconds have elapsed.

    NOTE(review): the upload is always opened with ``filetype="pdf"``; other
    formats end up in the "Text extraction failed" branch.

    Args:
        file_bytes: Raw bytes of the uploaded file, or ``None``/empty when
            nothing was uploaded.

    Returns:
        A string. On success it starts with the processed character count and
        elapsed time, followed by one entry per chunk (summary, skip notice or
        per-chunk error). On failure it is a single line starting with "❌".
        Errors are reported in the return value rather than raised.
    """
    start = time.monotonic()  # monotonic: immune to wall-clock adjustments
    print(f"File type: {type(file_bytes)}")

    if not file_bytes:
        return "❌ No file provided"

    try:
        text = _clean_text(_extract_pdf_text(file_bytes))
    except Exception as e:  # UI boundary: report the failure instead of raising
        return f"❌ Text extraction failed: {e}"
    print(f"Extracted chars: {len(text)}")

    if not text:
        return "❌ No text found"

    text = text[:_MAX_CHARS]
    # text is non-empty here, so at least one chunk is always produced.
    chunks = [text[i:i + _CHUNK_CHARS] for i in range(0, len(text), _CHUNK_CHARS)]
    print(f"Chunks created: {len(chunks)}")

    summaries = []
    for number, chunk in enumerate(chunks, start=1):
        if time.monotonic() - start > _TIME_BUDGET_S:
            summaries.append("⚠️ Stopped early")
            break
        if _is_equation_heavy(chunk):
            summaries.append(f"**Chunk {number}**: Skipped (equation-heavy)")
            continue
        try:
            result = summarizer(chunk, max_length=60, min_length=10, do_sample=False)
            summaries.append(f"**Chunk {number}**:\n{result[0]['summary_text']}")
        except Exception as e:
            # One bad chunk should not discard the summaries already produced.
            summaries.append(f"**Chunk {number}**: ❌ Error: {e}")

    elapsed = time.monotonic() - start
    header = f"**Chars**: {len(text)}\n**Time**: {elapsed:.2f}s\n\n"
    return header + "\n\n".join(summaries)
# Gradio UI: one file upload (delivered to the handler as raw bytes) mapped to
# one text box holding the summary report.
# NOTE(review): the label advertises TXT, but summarize_file opens every
# upload with filetype="pdf" — confirm whether TXT is meant to be supported.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 PDF/TXT Notes", type="binary"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="Fast Summarizer",
    description="300,000+ chars in ~20–30s (CPU)",
)
def _launch_demo():
    """Start the Gradio server on port 7860 without a public share link.

    A launch failure is printed rather than propagated.
    """
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {str(e)}")


# Only start the server when the file is executed as a script.
if __name__ == "__main__":
    _launch_demo()