"""Gradio app that summarizes uploaded PDF/TXT notes with t5-small on CPU."""

import logging
import re
import time

import fitz
import gradio as gr
import torch
from transformers import pipeline

logging.basicConfig(level=logging.ERROR)

# Tunables for the summarization pass.
MAX_CHARS = 300_000        # Input is truncated to this many characters.
CHUNK_SIZE = 2_000         # Characters handed to the model per call.
TIME_BUDGET_SECONDS = 20   # Stop summarizing new chunks after this long.
MAX_SYMBOL_RATIO = 0.5     # Chunks with more non-alphanumerics are skipped.

# Compiled once; reused for every upload.
_INLINE_MATH_RE = re.compile(r"\$\s*([^$]+)\s*\$")
# The lookahead keeps longer commands such as \caption from being rewritten.
_CAP_RE = re.compile(r"\\cap(?![A-Za-z])|∩")
_WHITESPACE_RE = re.compile(r"\s+")

device = -1  # CPU-only
print("⚠️ CPU-only. Expect ~20–30s for 300,000 chars.")

try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    # SystemExit instead of the site-provided exit(), which is not always
    # available (e.g. when Python runs without the site module).
    raise SystemExit(1) from e


def _extract_text(file_bytes):
    """Return the raw text of an uploaded PDF or plain-text file.

    Args:
        file_bytes: The uploaded file's content as bytes.

    Returns:
        The concatenated page text for a PDF, otherwise the bytes decoded
        as UTF-8 (undecodable bytes become replacement characters).
    """
    # A PDF announces itself with "%PDF-" near the start of the file;
    # anything else is treated as plain text so TXT uploads work too.
    if b"%PDF-" in file_bytes[:1024]:
        # The context manager closes the document even if a page fails.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            return "".join(page.get_text("text") for page in doc)
    return file_bytes.decode("utf-8", errors="replace")


def _clean_text(text):
    """Strip LaTeX/OCR noise and reduce *text* to single-spaced ASCII."""
    text = _INLINE_MATH_RE.sub(r"\1", text)  # Drop the $...$ delimiters.
    # Padded so "A\cap B" becomes "A intersection B", not "Aintersection B".
    text = _CAP_RE.sub(" intersection ", text)
    # Turn Unicode whitespace (e.g. non-breaking spaces) into plain spaces
    # *before* the ASCII filter, otherwise the words around it would fuse.
    text = _WHITESPACE_RE.sub(" ", text)
    text = text.encode("ascii", "ignore").decode("ascii")  # ASCII only.
    # Dropping characters can leave doubled or edge spaces; collapse again.
    return _WHITESPACE_RE.sub(" ", text).strip()


def _is_equation_heavy(chunk):
    """Return True when more than MAX_SYMBOL_RATIO of *chunk* is non-alphanumeric."""
    non_alnum = sum(not c.isalnum() for c in chunk)
    return non_alnum / len(chunk) > MAX_SYMBOL_RATIO


def summarize_file(file_bytes):
    """Summarize an uploaded PDF or text file chunk by chunk.

    Args:
        file_bytes: Raw bytes of the uploaded file, or None/empty when
            nothing was uploaded.

    Returns:
        A Markdown-style report with the character count, elapsed time and
        one entry per chunk, or a single "❌ ..." message when the input is
        missing, unreadable or contains no text.
    """
    start = time.monotonic()
    print(f"File type: {type(file_bytes)}")

    if not file_bytes:
        return "❌ No file uploaded"

    try:
        text = _clean_text(_extract_text(file_bytes))
        print(f"Extracted chars: {len(text)}")
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}"

    if not text:
        return "❌ No text found"

    text = text[:MAX_CHARS]
    # text is non-empty here, so there is always at least one chunk.
    chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
    print(f"Chunks created: {len(chunks)}")

    summaries = []
    for number, chunk in enumerate(chunks, start=1):
        if time.monotonic() - start > TIME_BUDGET_SECONDS:
            summaries.append("⚠️ Stopped early")
            break
        if _is_equation_heavy(chunk):
            summaries.append(f"**Chunk {number}**: Skipped (equation-heavy)")
            continue
        try:
            # truncation=True clips a chunk that tokenizes to more than the
            # model's maximum input length instead of feeding it through.
            result = summarizer(
                chunk,
                max_length=60,
                min_length=10,
                do_sample=False,
                truncation=True,
            )
            summary = result[0]["summary_text"]
            summaries.append(f"**Chunk {number}**:\n{summary}")
        except Exception as e:
            # One failing chunk should not discard the summaries so far.
            summaries.append(f"**Chunk {number}**: ❌ Error: {str(e)}")

    elapsed = time.monotonic() - start
    header = f"**Chars**: {len(text)}\n**Time**: {elapsed:.2f}s\n\n"
    return header + "\n\n".join(summaries)


demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 PDF/TXT Notes", type="binary"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="Fast Summarizer",
    description="300,000+ chars in ~20–30s (CPU)",
)

if __name__ == "__main__":
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {str(e)}")