|
|
import gradio as gr |
|
|
import re |
|
|
from transformers import pipeline, AutoTokenizer |
|
|
from PyPDF2 import PdfReader |
|
|
import tempfile |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face model id for the chunk summarizer (small enough for CPU).
MODEL_NAME = "sshleifer/distilbart-cnn-6-6"

# Shared tokenizer: used by the summarization pipeline AND for token-aware
# chunking in chunk_text(), so chunk boundaries match what the model sees.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Summarization pipeline; device=-1 pins inference to the CPU.
summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1
)

# Separate instruction-tuned model that turns the finished summary into
# study tips (see generate_ai_advice()).
advice_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=-1
)

# Tokens per chunk fed to the summarizer -- presumably kept below the
# model's input limit (BART-family encoders top out at 1024 tokens); confirm.
CHUNK_SIZE = 900
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Normalize summarizer output: straighten curly quotes, collapse
    repeated punctuation/whitespace, and drop duplicated sentences.

    Args:
        text: Raw summary text from the model.

    Returns:
        Cleaned text with each distinct sentence appearing at most once,
        order preserved.
    """
    # Fix: the original literals were mojibake'd into "β", so each pair of
    # replace() calls targeted the SAME character and the second call of
    # each pair was a no-op. Use explicit escapes for the curly quotes.
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    # Collapse runs of dots/apostrophes and whitespace into single chars.
    text = re.sub(r"[.]{2,}", ".", text)
    text = re.sub(r"[']{2,}", "'", text)
    text = re.sub(r"\s+", " ", text)
    # De-duplicate sentences case-insensitively while preserving order.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    seen = set()
    result = []
    for s in sentences:
        key = s.strip().lower()
        if key and key not in seen:
            seen.add(key)
            result.append(s.strip())
    return " ".join(result)
|
|
|
|
|
def chunk_text(text: str):
    """Split *text* into decoded chunks of at most CHUNK_SIZE tokens.

    Token-aware chunking keeps each piece within the summarizer's input
    budget regardless of how tokens map to characters.

    Args:
        text: Full document text.

    Returns:
        List of text chunks (empty list for empty input).
    """
    # Fix: the original loop bound a local named `chunk_text`, shadowing
    # this function's own name inside its body.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(token_ids[start:start + CHUNK_SIZE], skip_special_tokens=True)
        for start in range(0, len(token_ids), CHUNK_SIZE)
    ]
|
|
|
|
|
def generate_ai_advice(summary: str) -> str:
    """Generate personalized study advice based on the paper summary.

    Args:
        summary: Markdown summary text (typically produced by
            summarize_long_text).

    Returns:
        A markdown section with up to 5 AI-generated study tips.
    """
    # Keep the prompt short for the small model's limited context window.
    truncated_summary = summary[:1000]

    prompt = (
        f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
        "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
        "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
        "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
        "Output only the 5 tips as bullet points, nothing else."
    )

    # Greedy decoding. Fix: `temperature=0.7` was removed -- transformers
    # ignores (and warns about) temperature when do_sample=False.
    generated = advice_generator(
        prompt,
        max_length=250,
        num_return_sequences=1,
        do_sample=False
    )[0]["generated_text"]

    # Prefer one tip per line. Fix: the old filter
    # `startswith('-') or tip.strip()` was redundant -- the truthiness
    # check alone decided the outcome.
    tips = [tip.strip() for tip in generated.split('\n') if tip.strip()]
    if len(tips) < 3:
        # Model ignored the bullet format; fall back to sentence splitting.
        tips = [t.strip() for t in generated.split('.') if t.strip()]

    advice_md = "\n\n---\n\n### π AI-Generated Study Tips\n\n"
    for tip in tips[:5]:
        # Strip any leading bullet markers the model emitted itself.
        advice_md += f"- {tip.lstrip('- ').strip()}\n"

    advice_md += "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) for long-term retention!"
    return advice_md
|
|
|
|
|
def extract_possible_headings(text: str) -> str:
    """Heuristically pull likely titles and subtitles out of raw text.

    A line counts as a heading when it is non-empty, shorter than 80
    characters, and is either all uppercase, numbered ("1. Intro" style),
    or opens like Title Case ("Word Word").

    Returns:
        A markdown section listing the headings, or "" when none match.
    """
    def _looks_like_heading(candidate: str) -> bool:
        # Short line that is ALL CAPS, "1. Section", or "Word Word".
        if not candidate or len(candidate) >= 80:
            return False
        return bool(
            candidate.isupper()
            or re.match(r'^\d+\.?\s', candidate)
            or re.match(r'^[A-Z][a-z]+\s[A-Z]', candidate)
        )

    headings = [ln.strip() for ln in text.split('\n') if _looks_like_heading(ln.strip())]
    if not headings:
        return ""
    # Leading '' makes every heading (including the first) get a "- " prefix.
    bullets = "\n- ".join([''] + headings)
    return "### Extracted Possible Headings/Subtitles\n\n" + bullets + "\n\n---\n\n"
|
|
|
|
|
def summarize_long_text(text: str, progress=gr.Progress()) -> str:
    """Summarize long text chunk-by-chunk, then append AI study advice.

    Args:
        text: Raw document text (pasted or extracted from a PDF).
        progress: Gradio progress tracker (injected by Gradio at call time;
            the default instance is Gradio's documented convention).

    Returns:
        Markdown string: optional headings section + per-chunk bullet
        summaries + AI-generated study tips, or an error message for
        empty input.
    """
    if not text or not text.strip():
        return "No text provided."

    progress(0, desc="Extracting headings...")

    headings_section = extract_possible_headings(text)

    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text)

    summaries = []
    progress(0.2, desc="Summarizing chunks...")
    for i in progress.tqdm(range(len(chunks))):
        try:
            summary = summarizer(
                chunks[i],
                max_length=200,
                min_length=60,
                do_sample=False
            )[0]["summary_text"]
            summaries.append(f"**Chunk {i+1} Summary:** {clean_text(summary)}")
        except Exception:
            # Fix: failures were silently swallowed (`pass`), which could
            # yield an empty summary with no explanation. Stay best-effort,
            # but surface the gap to the user.
            summaries.append(f"**Chunk {i+1} Summary:** *(summarization failed for this chunk)*")

    summary_md = "### Detailed Summary (in Bullet Points)\n\n"
    for entry in summaries:
        summary_md += f"- {entry}\n"

    progress(0.8, desc="Generating AI advice...")
    ai_advice = generate_ai_advice(summary_md)

    progress(1, desc="Done!")
    return headings_section + summary_md + ai_advice
|
|
|
|
|
def read_pdf(file) -> str:
    """Safely extract all page text from a PDF.

    Returns the concatenated page text, or an error-message string
    ("PDF read error: ...") when extraction fails.
    """
    try:
        document = PdfReader(file)
        # extract_text() may return None for image-only pages; coerce to "".
        page_texts = (page.extract_text() or "" for page in document.pages)
        return "\n".join(page_texts)
    except Exception as err:
        return f"PDF read error: {str(err)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_download_file(content: str) -> str:
    """Persist *content* to a temporary UTF-8 .txt file for download.

    Returns:
        Path of the newly created temp file (Gradio's File component
        serves it; the file is intentionally not auto-deleted).
    """
    # delete=False keeps the file on disk after the handle closes so the
    # download component can read it later.
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8")
    with handle as out:
        out.write(content)
    return handle.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_input(text: str, file, progress=gr.Progress()):
    """Button handler: choose the input source, summarize, build download.

    Args:
        text: Pasted text from the textbox (may be None or empty).
        file: Uploaded PDF file object, or None.
        progress: Gradio progress tracker (injected at call time).

    Returns:
        Tuple of (result markdown, download file path or None).
    """
    progress(0, desc="Reading input...")
    if file is not None:
        # Uploaded PDF wins over pasted text.
        input_text = read_pdf(file)
    elif text and text.strip():
        # Fix: guard against `text is None` (a cleared textbox), which
        # previously crashed on .strip() with AttributeError.
        input_text = text
    else:
        return "Please paste some text or upload a PDF.", None

    result = summarize_long_text(input_text, progress)
    download_path = create_download_file(result)

    return result, download_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI definition. Components created inside the Blocks context are
# laid out in declaration order, so statement order matters here.
# NOTE(review): the "π" and "β’" glyphs in the display strings below look
# like mojibake (probably emoji and "•" bullets in the original source) --
# confirm the file's encoding before editing them.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    # Header and feature overview shown at the top of the page.
    gr.Markdown("# π Long Text Summarizer + AI Study Assistant")
    gr.Markdown(
        "β’ Handles very long documents (thousands of words)\n"
        "β’ Supports **PDF** upload or direct paste\n"
        "β’ Runs on CPU β works on free hardware\n"
        "β’ Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
        "β’ Includes **5 AI-generated study tips** tailored to the content\n"
        "β’ Download result as .txt file\n"
        "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
    )

    # Inputs: paste box and PDF upload, side by side in one row.
    with gr.Row():
        text_input = gr.Textbox(
            lines=10,
            label="Paste your text here (optional)",
            placeholder="Paste lecture notes, article, book chapter...",
        )
        file_input = gr.File(
            label="Or upload a PDF",
            file_types=[".pdf"]
        )

    summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")

    # Outputs: read-only summary text plus a downloadable .txt file.
    output = gr.Textbox(
        lines=16,
        label="Summary + AI-generated study advice",
        interactive=False
    )

    download_output = gr.File(
        label="Download full result (.txt)",
        interactive=False
    )

    # Wire the button to the processing pipeline; process_input returns
    # (markdown, file path) matching the two output components.
    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input],
        outputs=[output, download_output]
    )

# Start the app (blocking call).
demo.launch()