"""Long-text summarizer + AI study assistant (Gradio app, CPU-only).

Pipeline: optional PDF extraction -> heading heuristics -> token-aware
chunking -> per-chunk DistilBART summaries -> flan-t5 study tips ->
downloadable .txt result.
"""

import re
import tempfile

import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, pipeline

# =========================
# Model setup (CPU-safe)
# =========================
# Use smaller, faster models to speed up processing on free CPU hardware.
MODEL_NAME = "sshleifer/distilbart-cnn-6-6"  # Smaller than 12-6, faster on CPU

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1,  # CPU only
)

# Smaller flan-t5-small for faster advice generation.
advice_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=-1,  # CPU only
)

CHUNK_SIZE = 900  # safe margin under the model's typical max input length


# =========================
# Utilities
# =========================
def clean_text(text: str) -> str:
    """Normalize quotes/spacing, collapse repeated punctuation, and
    drop duplicate sentences (case-insensitive) from *text*."""
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = re.sub(r"[.]{2,}", ".", text)
    text = re.sub(r"[']{2,}", "'", text)
    text = re.sub(r"\s+", " ", text)

    # De-duplicate sentences: summarization models sometimes repeat themselves.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    seen = set()
    result = []
    for s in sentences:
        key = s.strip().lower()
        if key and key not in seen:
            seen.add(key)
            result.append(s.strip())
    return " ".join(result)


def chunk_text(text: str) -> list:
    """Split *text* into token-bounded chunks so no chunk overflows the
    summarization model's input limit.

    Returns a list of decoded text chunks (may be empty for empty input).
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for i in range(0, len(tokens), CHUNK_SIZE):
        window = tokens[i:i + CHUNK_SIZE]
        # NOTE: decoded local renamed — the original shadowed the function name.
        decoded = tokenizer.decode(window, skip_special_tokens=True)
        chunks.append(decoded)
    return chunks


def generate_ai_advice(summary: str) -> str:
    """Generate a Markdown section with up to 5 study tips based on *summary*.

    The summary is truncated to 1000 chars to keep the prompt inside
    flan-t5-small's context window.
    """
    truncated_summary = summary[:1000]
    prompt = (
        f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
        "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
        "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
        "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
        "Output only the 5 tips as bullet points, nothing else."
    )

    # do_sample=False => greedy decoding; temperature would be ignored, so
    # it is intentionally not passed (the original passed both, which only
    # triggers a transformers warning).
    generated = advice_generator(
        prompt,
        max_length=250,
        num_return_sequences=1,
        do_sample=False,
    )[0]["generated_text"]

    # Parse into tips with a deliberate fallback cascade:
    # 1) lines that look like bullets, 2) any nonempty lines, 3) sentences.
    # (The original's condition `startswith('-') or tip.strip()` accepted
    # every nonempty line, making the bullet check dead code.)
    lines = [ln.strip() for ln in generated.split("\n")]
    tips = [ln for ln in lines if ln.startswith("-")]
    if len(tips) < 3:
        tips = [ln for ln in lines if ln]
    if len(tips) < 3:
        tips = [s.strip() for s in generated.split(".") if s.strip()]

    advice_md = "\n\n---\n\n### \U0001F4DA AI-Generated Study Tips\n\n"
    for tip in tips[:5]:
        clean_tip = tip.lstrip("- ").strip()
        advice_md += f"- {clean_tip}\n"
    advice_md += (
        "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) "
        "for long-term retention!"
    )
    return advice_md


def extract_possible_headings(text: str) -> str:
    """Heuristically extract candidate titles/subtitles from raw text.

    A line is considered a heading when it is short (< 80 chars) and is
    either ALL CAPS, starts with a section number ("1. ..."), or looks like
    Title Case ("Word Another").  Returns a Markdown section, or "" when
    nothing matched.
    """
    headings = []
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped or len(stripped) >= 80:
            continue
        if (
            stripped.isupper()
            or re.match(r"^\d+\.?\s", stripped)
            or re.match(r"^[A-Z][a-z]+\s[A-Z]", stripped)
        ):
            headings.append(stripped)
    if headings:
        return (
            "### Extracted Possible Headings/Subtitles\n\n"
            + "\n- ".join([""] + headings)
            + "\n\n---\n\n"
        )
    return ""


def summarize_long_text(text: str, progress=gr.Progress()) -> str:
    """Summarize long *text* chunk-by-chunk and append AI study advice.

    Returns Markdown: optional headings section + bullet-point chunk
    summaries + generated study tips.
    """
    if not text or not text.strip():
        return "No text provided."

    progress(0, desc="Extracting headings...")
    headings_section = extract_possible_headings(text)

    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text)
    summaries = []

    progress(0.2, desc="Summarizing chunks...")
    for i in progress.tqdm(range(len(chunks))):
        chunk = chunks[i]
        try:
            summary = summarizer(
                chunk,
                max_length=200,  # compromise between summary length and CPU time
                min_length=60,
                do_sample=False,
            )[0]["summary_text"]
            cleaned = clean_text(summary)
            summaries.append(f"**Chunk {i+1} Summary:** {cleaned}")
        except Exception:
            # Best-effort: a failing chunk must not abort the whole run,
            # but the gap is now recorded instead of silently dropped.
            summaries.append(
                f"**Chunk {i+1} Summary:** (skipped — summarization failed)"
            )

    summary_md = "### Detailed Summary (in Bullet Points)\n\n"
    for s in summaries:
        summary_md += f"- {s}\n"

    progress(0.8, desc="Generating AI advice...")
    # The bulleted summary (not the raw text) seeds the advice generation.
    ai_advice = generate_ai_advice(summary_md)

    progress(1, desc="Done!")
    return headings_section + summary_md + ai_advice


def read_pdf(file) -> str:
    """Extract text from a PDF, returning an error string on failure
    (callers treat the result as plain text either way)."""
    try:
        reader = PdfReader(file)
        pages = [page.extract_text() or "" for page in reader.pages]
        # Join with newlines to preserve line breaks for heading detection.
        return "\n".join(pages)
    except Exception as e:
        return f"PDF read error: {str(e)}"


# =========================
# Download helper
# =========================
def create_download_file(content: str) -> str:
    """Write *content* to a temp .txt file and return its path for the
    Gradio file-download component.  delete=False so Gradio can serve it."""
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write(content)
        return tmp.name


# =========================
# Main handler
# =========================
def process_input(text: str, file, progress=gr.Progress()):
    """Gradio click handler: prefer the uploaded PDF, fall back to pasted
    text.  Returns (markdown result, download path | None)."""
    input_text = ""
    progress(0, desc="Reading input...")

    if file is not None:
        input_text = read_pdf(file)
    elif text and text.strip():
        # Guard against text=None (the original crashed on None.strip()).
        input_text = text
    else:
        return "Please paste some text or upload a PDF.", None

    result = summarize_long_text(input_text, progress)
    download_path = create_download_file(result)
    return result, download_path


# =========================
# Gradio UI
# =========================
with gr.Blocks() as demo:
    gr.Markdown("# \U0001F4C4 Long Text Summarizer + AI Study Assistant")
    gr.Markdown(
        "\u2022 Handles very long documents (thousands of words)\n"
        "\u2022 Supports **PDF** upload or direct paste\n"
        "\u2022 Runs on CPU \u2013 works on free hardware\n"
        "\u2022 Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
        "\u2022 Includes **5 AI-generated study tips** tailored to the content\n"
        "\u2022 Download result as .txt file\n"
        "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
    )

    with gr.Row():
        text_input = gr.Textbox(
            lines=10,
            label="Paste your text here (optional)",
            placeholder="Paste lecture notes, article, book chapter...",
        )
        file_input = gr.File(
            label="Or upload a PDF",
            file_types=[".pdf"],
        )

    summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
    output = gr.Textbox(
        lines=16,
        label="Summary + AI-generated study advice",
        interactive=False,
    )
    download_output = gr.File(
        label="Download full result (.txt)",
        interactive=False,
    )

    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input],
        outputs=[output, download_output],
    )

# Launch only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()