# Spaces:
# Runtime error
# Runtime error
"""PDF summarizer Space: module setup — imports, NLTK data, logging, model load."""
import io
import logging
import re
import sys
import time

import gradio as gr
import fitz  # PyMuPDF
import torch  # required at runtime by the transformers pipeline backend
import matplotlib

matplotlib.use('Agg')  # headless backend: render charts to buffers, no display server
import matplotlib.pyplot as plt
import nltk
from PIL import Image
from transformers import pipeline

# Download punkt tokenizer if not already present (quiet: no progress spam in logs).
nltk.download('punkt', quiet=True)
from nltk.tokenize import sent_tokenize

logging.basicConfig(level=logging.ERROR)

device = -1  # CPU-only (transformers convention: device=-1 means CPU)
print("β οΈ Optimized CPU-only version.")

try:
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)
except Exception as e:
    print(f"β Model loading failed: {str(e)}")
    # sys.exit is always available; the builtin exit() is injected by `site`
    # and may be absent when the interpreter runs with -S.
    sys.exit(1)
def visualize_chunk_status(chunk_data):
    """Render a horizontal bar chart of per-chunk processing time, colored by status.

    chunk_data: list of dicts with keys 'chunk' (1-based index), 'status'
    ('summarized' / 'skipped' / 'error'), and optionally 'time' (seconds).
    Returns a PIL Image of the rendered chart.
    """
    palette = {'summarized': 'green', 'skipped': 'orange', 'error': 'red'}
    bar_labels = []
    bar_colors = []
    bar_widths = []
    for entry in chunk_data:
        bar_labels.append(f"C{entry['chunk']}")
        bar_colors.append(palette.get(entry['status'], 'gray'))
        bar_widths.append(entry.get('time', 0.1))  # tiny default so the bar is visible

    fig, axis = plt.subplots(figsize=(10, 2.5))
    axis.barh(bar_labels, bar_widths, color=bar_colors)
    axis.set_xlabel("Time (s)")
    axis.set_title("π Chunk Processing Status")
    plt.tight_layout()

    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    png_buffer.seek(0)
    plt.close(fig)  # free the figure; the image lives on in the buffer
    return Image.open(png_buffer)
def create_summary_flowchart(summaries):
    """Draw successful chunk summaries as a vertical flowchart of connected boxes.

    Skipped and errored entries are excluded. Returns a PIL Image, or None
    when no chunk was successfully summarized.
    """
    # Keep only the text of successfully summarized chunks.
    ok_texts = []
    for entry in summaries:
        if not entry.startswith("**Chunk"):
            continue
        if "Skipped" in entry or "Error" in entry:
            continue
        pieces = entry.split("**:", 1)
        if len(pieces) > 1:
            ok_texts.append(pieces[1].strip())

    if not ok_texts:
        return None

    fig, axis = plt.subplots(figsize=(6, max(2, len(ok_texts) * 1.5)))
    axis.axis('off')
    box_style = dict(boxstyle="round,pad=0.5", facecolor="lightblue", edgecolor="black")
    # Boxes stacked top-to-bottom, two y-units apart.
    y_positions = list(range(len(ok_texts) * 2, 0, -2))
    last_index = len(ok_texts) - 1
    for idx, (y, text) in enumerate(zip(y_positions, ok_texts)):
        label = text.replace("\n", " ").strip()[:120]
        if len(label) == 120:
            label += "..."  # mark truncation of long summaries
        axis.text(0.5, y, label, ha='center', va='center', bbox=box_style, fontsize=9)
        if idx < last_index:
            # Arrow from this box down toward the next one.
            axis.annotate('', xy=(0.5, y - 1.5), xytext=(0.5, y - 0.5),
                          arrowprops=dict(arrowstyle="->", lw=1.5))

    png_buffer = io.BytesIO()
    plt.tight_layout()
    plt.savefig(png_buffer, format='png')
    png_buffer.seek(0)
    plt.close(fig)
    return Image.open(png_buffer)
def split_text_into_chunks(text, max_tokens=1500):
    """Split text into sentence-aligned chunks of at most ~max_tokens characters.

    NOTE(review): despite the parameter name, the limit is measured in
    characters, not model tokens. A single sentence longer than the limit
    still becomes its own (oversized) chunk. At most 20 chunks are returned.
    """
    pieces = []
    buffer = ""
    for sentence in sent_tokenize(text):
        if len(buffer) + len(sentence) <= max_tokens:
            buffer = buffer + " " + sentence
        else:
            pieces.append(buffer.strip())
            buffer = sentence
    if buffer:
        pieces.append(buffer.strip())
    return pieces[:20]  # hard cap to bound total processing time
def summarize_file(file_bytes):
    """Summarize a PDF supplied as raw bytes.

    Returns (summary_markdown, chunk_status_image, flowchart_image). On
    extraction failure or an empty PDF the images are None and the first
    element carries the error message.
    """
    start = time.time()
    chunk_info = []   # per-chunk status dicts for the bar chart
    summaries = []    # per-chunk markdown fragments
    try:
        # `with` closes the document deterministically instead of leaking
        # the handle until garbage collection.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            text = "".join(page.get_text("text") for page in doc)
        text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)  # strip inline $...$ math delimiters
        text = re.sub(r"\\cap", "intersection", text)     # spell out a common LaTeX symbol
        text = re.sub(r"\s+", " ", text).strip()          # collapse all whitespace runs
        text = "".join(c for c in text if ord(c) < 128)   # drop non-ASCII before summarizing
    except Exception as e:
        return f"β Text extraction failed: {str(e)}", None, None

    if not text.strip():
        return "β No text found", None, None

    chunks = split_text_into_chunks(text)
    for i, chunk in enumerate(chunks):
        chunk_start = time.time()
        chunk_result = {'chunk': i + 1, 'status': '', 'time': 0}
        # Skip symbol-dominated chunks (likely equations). The `chunk and`
        # guard prevents ZeroDivisionError on an empty chunk.
        if chunk and sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5:
            summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
            chunk_result['status'] = 'skipped'
        else:
            try:
                summary = summarizer(chunk, max_length=80, min_length=15, do_sample=False)[0]['summary_text']
                summaries.append(f"**Chunk {i+1}**:\n{summary}")
                chunk_result['status'] = 'summarized'
            except Exception as e:
                summaries.append(f"**Chunk {i+1}**: β Error: {str(e)}")
                chunk_result['status'] = 'error'
        chunk_result['time'] = time.time() - chunk_start
        chunk_info.append(chunk_result)

    final_summary = f"**Processed chunks**: {len(chunks)}\n**Time**: {time.time() - start:.2f}s\n\n" + "\n\n".join(summaries)
    process_img = visualize_chunk_status(chunk_info)
    flow_img = create_summary_flowchart(summaries)
    return final_summary, process_img, flow_img
# Gradio UI wiring: one binary file input, three outputs (summary text plus
# the two diagnostic images produced by summarize_file).
_output_components = [
    gr.Textbox(label="π Summary"),
    gr.Image(label="π Chunk Status", type="pil"),
    gr.Image(label="π Flow Summary", type="pil"),
]
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="π Upload PDF", type="binary"),
    outputs=_output_components,
    title="π PDF Summarizer with Visual Flow",
    description="Summarizes up to 30,000 characters from a PDF. Includes chunk status and flowchart visualizations.",
)
if __name__ == "__main__":
    # Launch locally (no public share link) on the Space's expected port.
    try:
        demo.launch(share=False, server_port=7860)
    except Exception as launch_err:
        print(f"β Gradio launch failed: {str(launch_err)}")