# textSum / app.py
# Author: Azidan — "Update app.py" (commit bb331f0, verified)
import gradio as gr
import re
from transformers import pipeline, AutoTokenizer
from PyPDF2 import PdfReader
import tempfile
# =========================
# Model setup (CPU-safe)
# =========================
# Models are loaded once at import time and shared by all requests.
# Smaller checkpoints are chosen deliberately to keep CPU inference usable.
MODEL_NAME = "sshleifer/distilbart-cnn-6-6"  # Smaller than 12-6, faster on CPU
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Summarization pipeline; device=-1 pins it to CPU (no GPU on free hardware).
summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1  # CPU only
)
# Use smaller flan-t5-small for faster advice generation
advice_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=-1  # CPU only
)
# Tokens per chunk fed to the summarizer; kept under the model's input
# limit (1024 for distilbart) with a safety margin.
CHUNK_SIZE = 900  # safe margin under typical max input
# =========================
# Utilities
# =========================
def clean_text(text: str) -> str:
    """Normalize model output: fix curly quotes, collapse repeated
    punctuation and whitespace, and drop duplicate sentences.

    Args:
        text: Raw summary text produced by the model.

    Returns:
        Cleaned text with each distinct sentence kept once, in the
        order it first appeared.
    """
    # Normalize Unicode curly quotes to ASCII. The original literals were
    # mojibake ("β€˜" etc., UTF-8 read through a legacy codepage) and so never
    # matched a real curly quote; explicit escapes survive any re-encoding.
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = re.sub(r"[.]{2,}", ".", text)   # "..." -> "."
    text = re.sub(r"[']{2,}", "'", text)   # doubled apostrophes
    text = re.sub(r"\s+", " ", text)       # collapse whitespace/newlines
    # De-duplicate sentences case-insensitively while preserving order.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    seen = set()
    result = []
    for s in sentences:
        key = s.strip().lower()
        if key and key not in seen:
            seen.add(key)
            result.append(s.strip())
    return " ".join(result)
def chunk_text(text: str):
    """Split *text* into token-bounded chunks so no piece overflows the model.

    Encodes once, slices the token ids into CHUNK_SIZE windows, and decodes
    each window back to plain text.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(token_ids[start:start + CHUNK_SIZE], skip_special_tokens=True)
        for start in range(0, len(token_ids), CHUNK_SIZE)
    ]
def generate_ai_advice(summary: str) -> str:
    """Generate personalized study advice based on the paper summary.

    Args:
        summary: Markdown summary text; only the first 1000 characters are
            fed to the model to keep CPU inference fast.

    Returns:
        A Markdown section with up to 5 bullet-point study tips.
    """
    truncated_summary = summary[:1000]
    prompt = (
        f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
        "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
        "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
        "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
        "Output only the 5 tips as bullet points, nothing else."
    )
    generated = advice_generator(
        prompt,
        max_length=250,
        num_return_sequences=1,
        do_sample=False  # greedy decoding; `temperature` removed — it is ignored when sampling is off
    )[0]["generated_text"]
    # Prefer explicit bullet lines. The old filter used
    # `startswith('-') or tip.strip()`, which accepted every non-empty line
    # and made the bullet check a no-op.
    lines = [ln.strip() for ln in generated.split('\n') if ln.strip()]
    tips = [ln for ln in lines if ln.startswith('-')] or lines
    if len(tips) < 3:
        # flan-t5-small often emits one run-on paragraph; fall back to sentences.
        tips = [t.strip() for t in generated.split('.') if t.strip()]
    advice_md = "\n\n---\n\n### 📚 AI-Generated Study Tips\n\n"
    advice_md += "".join(f"- {tip.lstrip('- ').strip()}\n" for tip in tips[:5])
    advice_md += "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) for long-term retention!"
    return advice_md
def extract_possible_headings(text: str) -> str:
    """Pull likely section headings out of raw document text.

    Heuristic: a heading is a short line (under 80 chars) that is ALL CAPS,
    begins with a section number ("1. ..."), or looks like Title Case.
    Returns a ready-made Markdown section, or "" when nothing matches.
    """
    found = []
    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if not candidate or len(candidate) >= 80:
            continue
        looks_like_heading = (
            candidate.isupper()
            or re.match(r'^\d+\.?\s', candidate)
            or re.match(r'^[A-Z][a-z]+\s[A-Z]', candidate)
        )
        if looks_like_heading:
            found.append(candidate)
    if not found:
        return ""
    bullets = "".join(f"\n- {h}" for h in found)
    return "### Extracted Possible Headings/Subtitles\n\n" + bullets + "\n\n---\n\n"
def summarize_long_text(text: str, progress=gr.Progress()) -> str:
    """Summarize long text in chunks + add AI study advice.

    Pipeline: heading extraction -> token-aware chunking -> per-chunk
    summarization -> bullet-point Markdown -> AI-generated study tips.

    Args:
        text: Raw document text (pasted or extracted from a PDF).
        progress: Gradio progress tracker used to update the UI; the default
            is evaluated once at definition time (Gradio's documented pattern).

    Returns:
        A Markdown string: optional headings section, bulleted chunk
        summaries, and study advice.
    """
    # Guard against empty / whitespace-only input before doing any model work.
    if not text or len(text.strip()) == 0:
        return "No text provided."
    progress(0, desc="Extracting headings...")
    # Extract possible headings first
    headings_section = extract_possible_headings(text)
    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text)
    summaries = []
    progress(0.2, desc="Summarizing chunks...")
    # progress.tqdm drives the per-chunk progress bar in the Gradio UI.
    for i in progress.tqdm(range(len(chunks))):
        chunk = chunks[i]
        try:
            summary = summarizer(
                chunk,
                max_length=200,  # Reduced slightly for speed (compromise between length and time)
                min_length=60,   # Reduced for speed
                do_sample=False
            )[0]["summary_text"]
            cleaned = clean_text(summary)
            summaries.append(f"**Chunk {i+1} Summary:** {cleaned}")
        except Exception:
            # NOTE(review): best-effort — a failing chunk is silently dropped,
            # so the final summary may be missing sections with no indication.
            pass  # skip problematic chunks
    # Format summaries as bullet points
    summary_md = "### Detailed Summary (in Bullet Points)\n\n"
    for s in summaries:
        summary_md += f"- {s}\n"
    progress(0.8, desc="Generating AI advice...")
    ai_advice = generate_ai_advice(summary_md)  # Use the bulleted summary for advice generation
    progress(1, desc="Done!")
    return headings_section + summary_md + ai_advice
def read_pdf(file) -> str:
    """Safely pull the text out of every page of a PDF.

    Pages are joined with newlines so downstream heading detection can still
    see line breaks. Any failure is reported as a plain string rather than
    raised, matching the app's best-effort style.
    """
    try:
        document = PdfReader(file)
        page_texts = []
        for page in document.pages:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)  # Join with newlines to preserve line breaks for heading detection
    except Exception as error:
        return f"PDF read error: {str(error)}"
# =========================
# Download helper
# =========================
def create_download_file(content: str) -> str:
    """Write *content* to a persistent temporary .txt file for Gradio's
    download component and return its path.

    delete=False keeps the file alive after close so Gradio can serve it.
    """
    tmp = tempfile.NamedTemporaryFile("w", suffix=".txt", encoding="utf-8", delete=False)
    try:
        tmp.write(content)
    finally:
        tmp.close()
    return tmp.name
# =========================
# Main handler
# =========================
def process_input(text: str, file, progress=gr.Progress()):
    """Handle a summarize request from the UI.

    An uploaded PDF takes priority over pasted text.

    Args:
        text: Pasted text; Gradio may deliver None for an untouched textbox.
        file: Uploaded PDF file object, or None.
        progress: Gradio progress tracker forwarded to the summarizer.

    Returns:
        Tuple of (result Markdown, download file path or None).
    """
    progress(0, desc="Reading input...")
    if file is not None:
        input_text = read_pdf(file)
    elif text and text.strip():  # `text and` guards against None from Gradio (was an AttributeError)
        input_text = text
    else:
        return "Please paste some text or upload a PDF.", None
    result = summarize_long_text(input_text, progress)
    download_path = create_download_file(result)
    return result, download_path
# =========================
# Gradio UI
# =========================
# Build the Gradio UI. User-facing bullets/emoji below were mojibake in the
# scraped source ("β€’", "πŸ“„") and are restored to the intended characters.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Long Text Summarizer + AI Study Assistant")
    gr.Markdown(
        "• Handles very long documents (thousands of words)\n"
        "• Supports **PDF** upload or direct paste\n"
        "• Runs on CPU – works on free hardware\n"
        "• Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
        "• Includes **5 AI-generated study tips** tailored to the content\n"
        "• Download result as .txt file\n"
        "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
    )
    with gr.Row():
        text_input = gr.Textbox(
            lines=10,
            label="Paste your text here (optional)",
            placeholder="Paste lecture notes, article, book chapter...",
        )
        file_input = gr.File(
            label="Or upload a PDF",
            file_types=[".pdf"]
        )
    summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
    output = gr.Textbox(
        lines=16,
        label="Summary + AI-generated study advice",
        interactive=False
    )
    download_output = gr.File(
        label="Download full result (.txt)",
        interactive=False
    )
    # Wire the button to the main handler; outputs feed the textbox + download.
    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input],
        outputs=[output, download_output]
    )
demo.launch()