Spaces:
Sleeping
Sleeping
import logging
import os
import re
import sys
import tempfile
import time

import fitz  # PyMuPDF
import gradio as gr
import torch
from transformers import pipeline
# === Setup ===
# Only errors are logged, so library warnings stay out of the console.
logging.basicConfig(level=logging.ERROR)
device = -1  # CPU
print("β οΈ CPU-only mode. Expect ~20β30s for large documents.")

# === Load summarization model ===
# The model is loaded once at import time and shared by every request.
# If loading fails there is nothing useful the app can do, so the process
# reports the error and terminates with a non-zero status.
try:
    summarizer = pipeline("summarization", model="t5-small", device=device, torch_dtype=torch.float32)
except Exception as e:
    print(f"β Model loading failed: {e}")
    # sys.exit is used instead of the bare ``exit`` builtin, which is only
    # injected by the ``site`` module and is not guaranteed to exist.
    sys.exit(1)
# === Text Preprocessing ===
def smart_chunk(text, max_chunk_len=2000):
    """Split *text* into sentence-aligned chunks of bounded length.

    Sentences are detected by splitting after ``.``, ``!`` or ``?`` when
    followed by one or more spaces. Consecutive sentences are packed into
    the same chunk while the accumulated length stays below
    ``max_chunk_len``. A single sentence longer than ``max_chunk_len`` is
    cut into pieces of at most ``max_chunk_len`` characters so that no
    chunk can grow without bound.

    Args:
        text: The text to split.
        max_chunk_len: Upper bound (in characters) used when packing
            sentences; must be a positive integer.

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_chunk_len`` is smaller than 1.
    """
    if max_chunk_len < 1:
        raise ValueError("max_chunk_len must be a positive integer")

    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?]) +', text):
        if len(current_chunk) + len(sentence) < max_chunk_len:
            current_chunk += sentence + " "
            continue

        # The sentence does not fit: flush what has been accumulated so far.
        # Skipping blank content avoids emitting an empty chunk when the very
        # first sentence is already too long.
        flushed = current_chunk.strip()
        if flushed:
            chunks.append(flushed)

        # Hard-split a sentence that exceeds the limit on its own; the final
        # remainder (at most max_chunk_len characters) starts the next chunk.
        while len(sentence) > max_chunk_len:
            piece = sentence[:max_chunk_len].strip()
            if piece:
                chunks.append(piece)
            sentence = sentence[max_chunk_len:]
        current_chunk = sentence + " " if sentence else ""

    tail = current_chunk.strip()
    if tail:
        chunks.append(tail)
    return chunks
# === Summarization per file ===
_MAX_INPUT_CHARS = 300000   # extracted text is truncated to this many characters
_TIME_BUDGET_SECONDS = 20   # no new chunk is started once this much time has elapsed
_MAX_SUMMARY_LINES = 15     # stop once the collected summaries reach this many lines


def summarize_file_bytes(file_bytes, filename):
    """Extract the text of one uploaded file and summarize it chunk by chunk.

    Content starting with ``%PDF`` is parsed with PyMuPDF; anything else is
    decoded as UTF-8 with undecodable bytes ignored. The text is truncated,
    split with ``smart_chunk`` and each chunk is passed to the module-level
    ``summarizer`` pipeline until the chunks run out, the time budget is
    exceeded, or enough summary lines have been collected.

    Args:
        file_bytes: Raw content of the file.
        filename: Display name used in the report header and error messages.

    Returns:
        A ``(display_text, download_text)`` tuple. On success both elements
        are the same formatted report. When extraction fails or no text is
        found, the first element is an error message and the second is an
        empty string.
    """
    start_time = time.time()
    try:
        if file_bytes.startswith(b'%PDF'):
            # The context manager closes the document even if a page fails.
            # NOTE(review): flags=16 appears to be PyMuPDF's TEXT_DEHYPHENATE
            # bit - confirm against the PyMuPDF text-extraction flag table.
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                text = "".join(page.get_text("text", flags=16) for page in doc)
        else:
            text = file_bytes.decode("utf-8", errors="ignore")
    except Exception as e:
        # Boundary handler: any parsing/decoding problem becomes a per-file
        # message instead of aborting the whole batch.
        return f"{filename}: β Text extraction failed: {e}", ""

    text = text.strip()
    if not text:
        return f"{filename}: β No text found.", ""

    text = text[:_MAX_INPUT_CHARS]
    chunks = smart_chunk(text)
    summaries, line_count = [], 0
    for i, chunk in enumerate(chunks):
        if time.time() - start_time > _TIME_BUDGET_SECONDS:
            summaries.append("β οΈ Stopped early due to time limit.")
            break
        try:
            summary = summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
            summaries.append(f"**Chunk {i+1}**:\n{summary.strip()}")
            line_count += summary.count('\n') + 1
            if line_count >= _MAX_SUMMARY_LINES:
                break
        except Exception as e:
            # A failing chunk is reported inline; later chunks are still tried.
            summaries.append(f"**Chunk {i+1}**: β Error summarizing: {e}")

    total_time = time.time() - start_time
    summary_text = f"π **{filename}**\n**Characters**: {len(text)} | **Time**: {total_time:.2f}s\n\n" + "\n\n".join(summaries)
    return summary_text, summary_text
# === Gradio Wrapper ===
def _read_upload(file_obj):
    """Return ``(filename, raw_bytes)`` for one uploaded item.

    Accepts either a filesystem path or an object exposing the path through
    a ``name`` attribute. The content is read from disk in binary mode so
    the result is always ``bytes``. If no readable path is available, the
    object's own ``read()`` is used as a fallback.
    """
    if isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)
        with open(path, "rb") as fh:
            return os.path.basename(path), fh.read()

    path = getattr(file_obj, "name", None)
    if isinstance(path, str) and os.path.isfile(path):
        with open(path, "rb") as fh:
            return os.path.basename(path), fh.read()

    data = file_obj.read()
    if isinstance(data, str):
        # A handle opened in text mode yields str; downstream expects bytes.
        data = data.encode("utf-8")
    filename = os.path.basename(path) if isinstance(path, str) and path else "uploaded_file"
    return filename, data


def summarize_multiple_files(files):
    """Summarize every uploaded file and write the combined report to disk.

    Args:
        files: Iterable of uploaded items (paths or objects with a ``name``
            attribute pointing at the stored upload). May be ``None`` or
            empty when nothing was uploaded.

    Returns:
        A ``(display_text, report_path)`` tuple: the per-file summaries
        joined by blank lines, and the path of a temporary ``.txt`` file
        holding the downloadable report. When no files were supplied the
        tuple is ``("No files uploaded.", None)``.
    """
    if not files:
        return "No files uploaded.", None

    separator = "=" * 60
    all_summaries = []
    report_parts = []
    for file_obj in files:
        filename, file_bytes = _read_upload(file_obj)
        summary, raw = summarize_file_bytes(file_bytes, filename)
        all_summaries.append(summary)
        report_parts.append(f"\n\n{raw}\n{separator}\n")

    # delete=False: the file must outlive this function so it can be served
    # as the download output.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write("".join(report_parts))
        summary_file_path = f.name
    return "\n\n".join(all_summaries), summary_file_path
# === Gradio Interface ===
# One multi-file upload input; two outputs that receive, in order, the two
# values returned by summarize_multiple_files (display text, report path).
# NOTE(review): type="file" looks like the Gradio 3.x spelling; newer Gradio
# releases appear to accept only "filepath" or "binary" - confirm against the
# installed version before upgrading.
demo = gr.Interface(
    fn=summarize_multiple_files,
    inputs=gr.File(label="π Upload PDF or TXT files", file_types=[".pdf", ".txt"], type="file", file_count="multiple"),
    outputs=[
        gr.Textbox(label="π Summary", lines=30, max_lines=100),
        gr.File(label="π₯ Download Summary as .txt")
    ],
    title="π Multi-File Summarizer",
    description="Summarizes multiple PDFs or TXTs into at least 15 lines each. Download final output as .txt. CPU-optimized."
)
if __name__ == "__main__":
    # Start the web UI locally on port 7860 without a public share link.
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as e:
        print(f"β Gradio launch failed: {e}")
        # Exit non-zero so a supervisor can tell the launch failed, matching
        # the model-loading failure path.
        raise SystemExit(1) from e