Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz | |
| import torch | |
| from transformers import pipeline | |
| import time, io | |
# Transformers pipelines take a CUDA device index, or -1 to run on the CPU.
device = 0 if torch.cuda.is_available() else -1
if device == -1:
    print("⚠️ No GPU detected. Expect ~10–20s for 300,000 chars on CPU.")

# Load the weights in float32. An integer dtype such as torch.int8 is not a
# valid `torch_dtype` for loading a model: transformers only accepts
# floating-point dtypes there, and real 8-bit inference needs a quantization
# backend rather than a dtype switch.
summarizer = pipeline(
    "summarization",
    model="google/pegasus-xsum",
    device=device,
    torch_dtype=torch.float32,
)
def extract_text(file_bytes):
    """Return the plain text contained in an uploaded PDF or UTF-8 text file.

    Args:
        file_bytes: Raw file content as ``bytes``. A filesystem path
            (``str`` or ``os.PathLike``) is also accepted and read from
            disk, so the function works whether the caller hands over the
            content or only the location of the upload.

    Returns:
        The extracted text. Empty or missing input yields ``""``. Content
        that is neither a PDF nor valid UTF-8 yields a human-readable
        "Unsupported format" message instead of raising.
    """
    import os
    from pathlib import Path

    if not file_bytes:
        return ""
    if isinstance(file_bytes, (str, os.PathLike)):
        file_bytes = Path(file_bytes).read_bytes()

    # PDF files are recognised by their leading "%PDF" magic bytes.
    if file_bytes.startswith(b"%PDF"):
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        try:
            # NOTE(review): 16 looks like PyMuPDF's TEXT_DEHYPHENATE flag
            # (re-joins words split across line breaks) - confirm.
            return "".join(page.get_text("text", flags=16) for page in doc)
        finally:
            # Release the document even if a page fails to extract.
            doc.close()

    try:
        return file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        return "❌ Unsupported format (PDF/TXT only)"
async def summarize_file(file_bytes):
    """Summarize an uploaded PDF/TXT file chunk by chunk.

    The text is extracted with ``extract_text``, capped at 300,000
    characters, cut into 15,000-character chunks and sent to the
    module-level ``summarizer`` pipeline in batches. Work stops once roughly
    nine seconds have elapsed, so the result may cover only the first part
    of a long document.

    Args:
        file_bytes: The uploaded file content, as passed on to
            ``extract_text``.

    Returns:
        A Markdown-formatted string with the character count, the elapsed
        time and one summary per processed chunk, or a short error message
        when the file holds no usable text.
    """
    max_chars = 300000
    chunk_chars = 15000
    time_budget_s = 9

    # monotonic() is unaffected by wall-clock adjustments while timing.
    start = time.monotonic()
    if not file_bytes:
        return "❌ No text found"

    text = extract_text(file_bytes)[:max_chars]
    if not text.strip():
        return "❌ No text found"
    # extract_text reports undecodable input as a message instead of
    # raising; hand that message back rather than "summarizing" it.
    if text.endswith("Unsupported format (PDF/TXT only)"):
        return text

    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    batch_size = 2 if device == -1 else 10  # Smaller batch for CPU
    summaries = []
    for i in range(0, len(chunks), batch_size):
        if time.monotonic() - start > time_budget_s:
            summaries.append("⚠️ Stopped early")
            break
        batch = chunks[i:i + batch_size]
        try:
            # truncation=True makes the tokenizer cut each chunk down to the
            # model's maximum input length instead of overflowing it; only
            # the beginning of a long chunk therefore shapes its summary.
            batch_summaries = summarizer(
                batch,
                max_length=40,
                min_length=10,
                do_sample=False,
                batch_size=batch_size,
                truncation=True,
            )
        except Exception:
            # The whole batch failed, so flag every chunk that was in it.
            summaries.extend(
                f"**Chunk {i + j + 1}**: ❌ Error" for j in range(len(batch))
            )
            continue
        summaries.extend(
            f"**Chunk {i + j + 1}**:\n{s['summary_text']}"
            for j, s in enumerate(batch_summaries)
        )

    elapsed = time.monotonic() - start
    header = f"**Chars**: {len(text)}\n**Time**: {elapsed:.2f}s\n\n"
    return header + "\n\n".join(summaries)
# Gradio UI: one file upload in, one text summary out.
# NOTE(review): the emoji in the two labels were mis-encoded in the source
# and are restored here as a best guess - confirm the intended glyphs.
demo = gr.Interface(
    fn=summarize_file,
    # type="binary" makes Gradio pass the upload's raw bytes to fn, which
    # expects the file content (its parameter is `file_bytes`); without it
    # Gradio hands over a file path / temp-file reference instead.
    inputs=gr.File(label="📄 PDF/TXT Notes", type="binary"),
    outputs=gr.Textbox(label="📝 Summary"),
    title="Fast Summarizer",
    description="300,000+ chars in ~5–10s (GPU) or ~10–20s (CPU)",
)

if __name__ == "__main__":
    demo.launch(share=False)