"""CPU-only multi-format document summarizer served through Gradio.

Extracts plain text from PDF / DOCX / XLSX / CSV / RTF / TXT / images
(OCR) — falling back to textract for anything else — then summarizes it
in 1,000-char chunks with a t5-small pipeline, batched 4 at a time with
a hard ~15s wall-clock budget.
"""

import logging
import mimetypes
import os
import re
import tempfile
import time
from io import BytesIO

import docx
import fitz  # PyMuPDF
import gradio as gr
import openpyxl  # noqa: F401 — backend for pandas engine='openpyxl'
import pandas as pd
import pytesseract
import textract
import torch
from PIL import Image
from striprtf.striprtf import rtf_to_text
from transformers import pipeline

logging.basicConfig(level=logging.ERROR)

device = -1  # transformers convention: -1 selects CPU
print("⚠️ CPU-only. Expect ~10–15s for 300,000 chars.")

try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as e:
    # The app is useless without the model, so fail fast at startup.
    print(f"❌ Model loading failed: {str(e)}")
    exit(1)


def _extract_text(file_bytes, mime):
    """Return plain text extracted from ``file_bytes`` for MIME type ``mime``.

    Dispatches on the guessed MIME type; unknown types fall through to
    textract.  Raises whatever the underlying parser raises — the caller
    turns that into a user-facing error message.
    """
    if mime == 'application/pdf':
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        return "".join(page.get_text("text") for page in doc)
    if mime in ['text/plain', 'text/rtf']:
        decoded = file_bytes.decode("utf-8", errors="ignore")
        return rtf_to_text(decoded) if mime == 'text/rtf' else decoded
    if mime in ['text/csv', 'application/vnd.ms-excel']:
        return " ".join(
            pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten()
        )
    if mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        document = docx.Document(BytesIO(file_bytes))
        return " ".join(p.text for p in document.paragraphs if p.text)
    if mime in ['image/jpeg', 'image/png']:
        # Grayscale and scale to 300px height (preserving aspect ratio)
        # before OCR.  BUG FIX: the original referenced `img` inside its
        # own initializer (`.resize((int(img.width ...)))`), which raised
        # NameError for every image upload.
        img = Image.open(BytesIO(file_bytes)).convert('L')
        img = img.resize((int(img.width * 300 / img.height), 300))
        return pytesseract.image_to_string(img)
    if mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
        df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
        return " ".join(df.astype(str).values.flatten())
    # Fallback: textract.process() takes a filesystem path, not raw bytes
    # (the original passed bytes and always failed here), so spill to a
    # temporary file first and clean it up afterwards.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(file_bytes)
        tmp_path = tmp.name
    try:
        return textract.process(tmp_path).decode("utf-8", errors="ignore")
    finally:
        os.unlink(tmp_path)


def _clean_text(text):
    """Normalize extracted text for the t5-small tokenizer.

    Strips inline ``$...$`` math delimiters, spells out ``\\cap``,
    collapses whitespace, and drops non-ASCII characters.
    """
    text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # unwrap $...$ math
    text = re.sub(r"\\cap", "intersection", text)
    text = re.sub(r"\s+", " ", text).strip()
    # t5-small handles ASCII best; drop everything else.
    return "".join(c for c in text if ord(c) < 128)


def summarize_file(file):
    """Summarize an uploaded file and return a Markdown-ish report string.

    ``file`` is the Gradio upload (a file-like object with ``.name``, or
    raw bytes).  Returns either an error string prefixed with ``❌`` or a
    header (char count, elapsed time) followed by per-chunk summaries.
    """
    start = time.time()
    print(f"File: {file.name if hasattr(file, 'name') else 'unknown'}")

    try:
        file_bytes = file.read() if hasattr(file, 'read') else file
        mime, _ = (
            mimetypes.guess_type(file.name)
            if hasattr(file, 'name')
            else (None, None)
        )
        text = _clean_text(_extract_text(file_bytes, mime))
        print(f"Extracted chars: {len(text)}")
    except Exception as e:
        return f"❌ Text extraction failed: {str(e)}"

    if not text.strip():
        return "❌ No text found"

    # Cap input size, then split into fixed 1,000-char chunks.
    text = text[:300000]
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
    print(f"Chunks created: {len(chunks)}")
    if not chunks:
        return "❌ No chunks to summarize"

    summaries = []
    for i in range(0, len(chunks), 4):  # batches of 4 chunks
        if time.time() - start > 15:  # hard wall-clock budget
            summaries.append("⚠️ Stopped early")
            break
        batch = chunks[i:i + 4]
        # Skip batches dominated (>70%) by non-alphanumeric characters —
        # typically equations or tables that summarize poorly.
        if any(
            sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7
            for chunk in batch
        ):
            summaries.append(
                f"**Chunk {i+1}–{i+len(batch)}**: Skipped (equation-heavy)"
            )
            continue
        try:
            results = summarizer(
                batch, max_length=50, min_length=10, do_sample=False
            )
            summaries.extend(
                f"**Chunk {i+j+1}**:\n{r['summary_text']}"
                for j, r in enumerate(results)
            )
        except Exception as e:
            summaries.append(
                f"**Chunk {i+1}–{i+len(batch)}**: ❌ Error: {str(e)}"
            )

    return (
        f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n"
        + "\n\n".join(summaries)
    )


demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📄 Any File", type="file"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="Fast Summarizer",
    description="300,000+ chars in ~10–15s (CPU)",
)

if __name__ == "__main__":
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {str(e)}")