Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz | |
| import torch | |
| from transformers import pipeline | |
| import time, logging, re, pandas as pd, docx, pytesseract, openpyxl, textract, mimetypes | |
| from PIL import Image | |
| from io import BytesIO | |
| from striprtf.striprtf import rtf_to_text | |
| import tempfile | |
# Silence library chatter; only real errors reach the console.
logging.basicConfig(level=logging.ERROR)

# -1 tells transformers.pipeline to run on CPU (no CUDA device index).
device = -1
print("β οΈ CPU-only. Expect ~5β9s for 300,000 chars.")

# Load the summarization model once at startup; a Space that cannot load
# its model is useless, so bail out immediately on failure.
try:
    summarizer = pipeline(
        "summarization",
        model="t5-small",
        device=device,
        torch_dtype=torch.float32,
    )
except Exception as load_err:
    print(f"β Model loading failed: {str(load_err)}")
    exit(1)
def summarize_file_bytes(file_bytes, filename):
    """Extract text from one file's raw bytes and summarize it chunk-by-chunk.

    Dispatches on the MIME type guessed from *filename* (PDF, txt/rtf,
    csv/xls, docx, jpeg/png via OCR, xlsx, textract fallback), cleans the
    extracted text, then runs the module-level ``summarizer`` over up to 12
    evenly spaced 1000-char chunks.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        filename: Original file name; used only to guess the MIME type.

    Returns:
        ``(summary_markdown, raw_summary)`` — on any failure the first item
        is an error message (note: the leading glyphs are mojibake'd emoji
        preserved from the original) and the second is ``""``.
    """
    start = time.time()
    try:
        if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
            return f"β (unknown): Invalid or empty file", ""
        # BUG FIX: mimetypes.guess_type returns the truthy tuple (None, None)
        # for unknown extensions, so the old `... or ('text/plain', None)`
        # fallback never fired and mime ended up None. Default explicitly.
        mime, _ = mimetypes.guess_type(filename)
        if mime is None:
            mime = 'text/plain'
        text = ""
        if mime == 'application/pdf':
            doc = fitz.open(stream=file_bytes, filetype="pdf")
            text = "".join(page.get_text("text") for page in doc)
        elif mime in ['text/plain', 'text/rtf']:
            decoded = file_bytes.decode("utf-8", errors="ignore")
            text = rtf_to_text(decoded) if mime == 'text/rtf' else decoded
        elif mime in ['text/csv', 'application/vnd.ms-excel']:
            text = " ".join(pd.read_csv(BytesIO(file_bytes)).astype(str).values.flatten())
        elif mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            doc = docx.Document(BytesIO(file_bytes))
            text = " ".join(p.text for p in doc.paragraphs if p.text)
        elif mime in ['image/jpeg', 'image/png']:
            # BUG FIX: the original referenced `img.width` inside the very
            # expression that first bound `img` (guaranteed NameError).
            # Open first, then scale to a 300px height for OCR.
            img = Image.open(BytesIO(file_bytes)).convert('L')
            img = img.resize((int(img.width * 300 / img.height), 300))
            text = pytesseract.image_to_string(img)
        elif mime == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
            df = pd.read_excel(BytesIO(file_bytes), engine='openpyxl')
            text = " ".join(df.astype(str).values.flatten())
        else:
            # BUG FIX: textract.process expects a file *path*, not bytes.
            # Spill to a temp file that keeps the original extension so
            # textract can pick the right backend, then clean up.
            suffix = "." + filename.rsplit(".", 1)[-1] if "." in filename else ""
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                tmp.write(file_bytes)
                tmp_path = tmp.name
            try:
                text = textract.process(tmp_path).decode("utf-8", errors="ignore")
            finally:
                os.unlink(tmp_path)
        text = re.sub(r"[^\x20-\x7E]", "", text)  # keep printable ASCII only
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # drop inline $...$ math delimiters
        text = re.sub(r"\\cap", "intersection", text)  # spell out LaTeX \cap
        text = re.sub(r"\s+", " ", text).strip()
        # Require a minimum of real content before spending model time.
        if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
            return f"β (unknown): Invalid or too short text", ""
        print(f"Extracted chars for (unknown): {len(text)}")
    except Exception as e:
        return f"β (unknown): Text extraction failed: {str(e)}", ""
    text = text[:300000]  # hard cap so runtime stays bounded on CPU
    chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
    print(f"Chunks for (unknown): {len(chunks)}")
    if not chunks:
        return f"β (unknown): No chunks to summarize", ""
    # Sample 12 chunks evenly across the document (or all of them, if fewer).
    selected_indices = [int(i * len(chunks) / 12) for i in range(12)] if len(chunks) >= 12 else list(range(len(chunks)))
    summaries = []
    for i in selected_indices:
        chunk = chunks[i]
        # Skip chunks that are mostly non-alphanumeric (equations/tables):
        # t5-small produces garbage on symbol soup.
        if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.7:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            continue
        try:
            summary = summarizer(chunk, max_length=40, min_length=10, do_sample=False)[0]['summary_text']
            summaries.append(f"**Chunk {i+1}**:\n{summary}")
        except Exception as e:
            summaries.append(f"**Chunk {i+1}**: β Error: {str(e)}")
    # Pad to exactly 12 entries so the output shape is always stable.
    while len(summaries) < 12:
        summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content")
    summary_text = f"π **(unknown)**\n**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])
    return summary_text, summary_text
def summarize_multiple_files(*file_objs):
    """Summarize every uploaded file and write the combined raw output to a .txt.

    Accepts file-like objects (anything with ``.read()`` and ``.name``) and —
    because ``gr.File(type="binary")`` hands the callback raw ``bytes``
    payloads, which the original rejected as "Invalid file" — raw bytes as
    well (backward-compatible generalization).

    Returns:
        ``(joined_summaries, temp_txt_path)``, or ``("β No files uploaded",
        None)`` when nothing was supplied.
    """
    if not file_objs or not any(file_objs):
        return "β No files uploaded", None
    all_summaries = []
    combined_text = ""
    files = file_objs[0] if isinstance(file_objs[0], list) else file_objs
    for file in files:
        # BUG FIX: gr.File(type="binary") delivers bytes, not file objects;
        # previously every real upload fell into the "Invalid file" branch.
        if isinstance(file, (bytes, bytearray)):
            file_bytes, filename = bytes(file), "uploaded"
        elif hasattr(file, 'read') and hasattr(file, 'name'):
            filename = file.name.split("/")[-1]
            file_bytes = file.read()
        else:
            all_summaries.append(f"β Invalid file: Missing read() or name")
            continue
        summary, raw_text = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        combined_text += f"\n\n{raw_text}\n" + "="*60 + "\n"
    # delete=False: Gradio needs the path to stay valid for the download link.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write(combined_text)
        summary_file_path = f.name
    return "\n\n".join(all_summaries), summary_file_path
# Wire the summarizer into a minimal Gradio UI: a multi-file binary upload
# in, a summary textbox plus a downloadable .txt out.
_file_input = gr.File(label="π Upload Any File", type="binary", file_count="multiple")
_summary_box = gr.Textbox(label="π Summary", lines=15, max_lines=100)
_download_out = gr.File(label="π₯ Download Summary as .txt")

demo = gr.Interface(
    fn=summarize_multiple_files,
    inputs=_file_input,
    outputs=[_summary_box, _download_out],
    title="π Multi-File Summarizer",
    description="Summarizes any file into exactly 15 lines. Download as .txt. ~5β9s for 300,000 chars (CPU)."
)
if __name__ == "__main__":
    # Local-only launch on the default Spaces port; report (rather than
    # traceback) any startup failure.
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as launch_err:
        print(f"β Gradio launch failed: {str(launch_err)}")