import logging
import mimetypes
import os
import re
import tempfile
import time
from io import BytesIO

import docx
import fitz  # PyMuPDF
import gradio as gr
import openpyxl  # noqa: F401 — engine backend for pd.read_excel
import pandas as pd
import pytesseract
import textract
import torch
from PIL import Image
from striprtf.striprtf import rtf_to_text
from transformers import pipeline

logging.basicConfig(level=logging.ERROR)

device = -1  # CPU-only (no CUDA device index)
print("⚠️ CPU-only. Expect ~5–9s for 300,000 chars.")

try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as e:
    print(f"❌ Model loading failed: {str(e)}")
    exit(1)


def summarize_file_bytes(file_bytes, filename):
    """Extract text from a file's raw bytes and summarize it chunk-by-chunk.

    Args:
        file_bytes: Raw content of the uploaded file.
        filename: Original filename; only its extension is used (to guess MIME).

    Returns:
        (summary_markdown, raw_summary_text) — both are the same formatted
        summary string on success; on failure the second element is "".
    """
    start = time.time()
    try:
        if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
            return f"❌ (unknown): Invalid or empty file", ""

        # BUG FIX: guess_type() returns a (type, encoding) tuple, which is
        # always truthy, so the old `... or ('text/plain', None)` fallback
        # never fired and `mime` could be None. Apply the intended default.
        mime, _ = mimetypes.guess_type(filename)
        if mime is None:
            mime = 'text/plain'

        text = ""
        if mime == 'application/pdf':
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            text = "".join(page.get_text("text") for page in doc)
        elif mime in ('text/plain', 'text/rtf'):
            decoded = file_bytes.decode("utf-8", errors="ignore")
            text = rtf_to_text(decoded) if mime == 'text/rtf' else decoded
        elif mime in ('text/csv', 'application/vnd.ms-excel'):
            text = " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
        elif mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            doc = docx.Document(BytesIO(file_bytes))
            text = " ".join(p.text for p in doc.paragraphs if p.text)
        elif mime in ('image/jpeg', 'image/png'):
            # BUG FIX: original referenced `img` on the right-hand side of its
            # own first assignment (NameError). Open first, then scale to a
            # fixed 300px height preserving aspect ratio for OCR.
            img = Image.open(BytesIO(file_bytes)).convert('L')
            img = img.resize((int(img.width * 300 / img.height), 300))
            text = pytesseract.image_to_string(img)
        elif mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
            df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
            text = " ".join(df.astype(str).values.flatten())
        else:
            # BUG FIX: textract.process() expects a file *path*, not raw
            # bytes — spool to a temp file (keeping the extension so textract
            # can pick a parser), then clean up.
            suffix = os.path.splitext(filename)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(file_bytes)
                tmp_path = tmp.name
            try:
                text = textract.process(tmp_path).decode("utf-8", errors="ignore")
            finally:
                os.unlink(tmp_path)

        # Normalize: printable ASCII only, strip inline $...$ math markers,
        # spell out \cap, collapse whitespace.
        text = re.sub(r"[^\x20-\x7E]", "", text)
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
        text = re.sub(r"\\cap", "intersection", text)
        text = re.sub(r"\s+", " ", text).strip()

        # Reject junk: too short, or fewer than 50 alphanumeric characters.
        if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
            return f"❌ (unknown): Invalid or too short text", ""
        print(f"Extracted chars for (unknown): {len(text)}")
    except Exception as e:
        return f"❌ (unknown): Text extraction failed: {str(e)}", ""

    # Cap input size, split into 1000-char chunks, and summarize up to 12
    # chunks sampled evenly across the document.
    text = text[:300000]
    chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
    print(f"Chunks for (unknown): {len(chunks)}")
    if not chunks:
        return f"❌ (unknown): No chunks to summarize", ""

    if len(chunks) >= 12:
        selected_indices = [int(i * len(chunks) / 12) for i in range(12)]
    else:
        selected_indices = list(range(len(chunks)))

    summaries = []
    for i in selected_indices:
        chunk = chunks[i]
        # Skip chunks that are mostly symbols (likely equations/tables).
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            continue
        try:
            summary = summarizer(chunk, max_length=40, min_length=10,
                                 do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")

    # Pad to exactly 12 entries so the output shape is stable.
    while len(summaries) < 12:
        summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content")

    summary_text = (
        f"📄 **(unknown)**\n**Chars**: {len(text)}\n"
        f"**Time**: {time.time()-start:.2f}s\n\n"
        + "\n\n".join(summaries[:12])
    )
    return summary_text, summary_text


def summarize_multiple_files(*file_objs):
    """Summarize each uploaded file and write all summaries to a temp .txt.

    Returns:
        (joined_summaries, path_to_txt_file) — or an error string and None
        when nothing was uploaded.
    """
    if not file_objs or not any(file_objs):
        return "❌ No files uploaded", None

    all_summaries = []
    combined_text = ""
    # Gradio may pass a single list of files or one file per positional arg.
    # NOTE(review): with gr.File(type="binary") recent Gradio versions pass
    # raw bytes rather than file-like objects — confirm against the installed
    # Gradio version; the hasattr guard below reports such items as invalid.
    files = file_objs[0] if isinstance(file_objs[0], list) else file_objs
    for file in files:
        if not hasattr(file, 'read') or not hasattr(file, 'name'):
            all_summaries.append(f"❌ Invalid file: Missing read() or name")
            continue
        filename = file.name.split("/")[-1]
        file_bytes = file.read()
        summary, raw_text = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        combined_text += f"\n\n{raw_text}\n" + "=" * 60 + "\n"

    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt",
                                     mode="w", encoding="utf-8") as f:
        f.write(combined_text)
        summary_file_path = f.name
    return "\n\n".join(all_summaries), summary_file_path


demo = gr.Interface(
    fn=summarize_multiple_files,
    inputs=gr.File(label="📄 Upload Any File", type="binary", file_count="multiple"),
    outputs=[
        gr.Textbox(label="📝 Summary", lines=15, max_lines=100),
        gr.File(label="📥 Download Summary as .txt"),
    ],
    title="📚 Multi-File Summarizer",
    description="Summarizes any file into exactly 15 lines. Download as .txt. ~5–9s for 300,000 chars (CPU).",
)

if __name__ == "__main__":
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"❌ Gradio launch failed: {str(e)}")