File size: 8,133 Bytes
487a5d4 b91ee99 641953a b898d31 b91ee99 bb331f0 b898d31 b91ee99 bb331f0 8c3fb35 bb331f0 8c3fb35 d8547ca 641953a 6a4c4d0 b91ee99 2f00a52 b91ee99 641953a b91ee99 b898d31 b91ee99 d8547ca b91ee99 487a5d4 b91ee99 b898d31 487a5d4 8c3fb35 641953a af3ae44 8c3fb35 af3ae44 8c3fb35 6a4c4d0 af3ae44 641953a af3ae44 641953a af3ae44 641953a af3ae44 8c3fb35 af3ae44 641953a 6a4c4d0 aba9518 bb331f0 aba9518 b91ee99 8c3fb35 bb331f0 aba9518 bb331f0 b898d31 8c3fb35 bb331f0 641953a bb331f0 641953a aba9518 bb331f0 641953a 8c3fb35 aba9518 bb331f0 aba9518 2f00a52 bb331f0 aba9518 b91ee99 d8547ca b91ee99 aba9518 b91ee99 641953a b91ee99 bb331f0 641953a bb331f0 b91ee99 641953a bb331f0 641953a b91ee99 2f00a52 8c3fb35 b91ee99 641953a aba9518 bb331f0 b91ee99 8c3fb35 641953a 8c3fb35 641953a 8c3fb35 b91ee99 6a4c4d0 641953a b91ee99 8c3fb35 641953a 8c3fb35 b91ee99 641953a b91ee99 487a5d4 8091a97 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
import gradio as gr
import re
from transformers import pipeline, AutoTokenizer
from PyPDF2 import PdfReader
import tempfile
# =========================
# Model setup (CPU-safe)
# =========================
# Use smaller, faster models to speed up processing.
# NOTE: these pipelines are loaded at import time, so the first start of the
# app blocks until the model weights are downloaded/cached.
MODEL_NAME = "sshleifer/distilbart-cnn-6-6"  # Smaller than 12-6, faster on CPU
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Summarization pipeline; the same tokenizer is reused by chunk_text() so
# chunk boundaries are measured in this model's tokens.
summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1  # CPU only
)
# Use smaller flan-t5-small for faster advice generation
advice_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=-1  # CPU only
)
CHUNK_SIZE = 900  # tokens per chunk; safe margin under typical max input
# =========================
# Utilities
# =========================
def clean_text(text: str) -> str:
    """Normalize summarizer output: quotes, spacing, repetition, punctuation.

    Args:
        text: Raw summary text produced by the model.

    Returns:
        Cleaned text with each distinct sentence (case-insensitive)
        appearing once, in original order.
    """
    # Normalize Unicode "smart" quotes to plain ASCII quotes in one pass.
    # NOTE(review): the original source was mojibake-garbled here — all four
    # replace targets had collapsed into the same character, leaving three of
    # the four .replace() calls dead; restored the intended distinct targets.
    text = text.translate(str.maketrans({
        "\u2018": "'",   # left single quotation mark
        "\u2019": "'",   # right single quotation mark
        "\u201c": '"',   # left double quotation mark
        "\u201d": '"',   # right double quotation mark
    }))
    text = re.sub(r"[.]{2,}", ".", text)   # "..."  -> "."
    text = re.sub(r"[']{2,}", "'", text)   # "''"   -> "'"
    text = re.sub(r"\s+", " ", text)       # collapse whitespace runs
    # Split on sentence-ending punctuation, then drop duplicate sentences
    # (summarizers often repeat themselves across chunk boundaries).
    sentences = re.split(r'(?<=[.!?])\s+', text)
    seen = set()
    result = []
    for s in sentences:
        key = s.strip().lower()
        if key and key not in seen:
            seen.add(key)
            result.append(s.strip())
    return " ".join(result)
def chunk_text(text: str):
    """Token-aware chunking to avoid model input overflow.

    Encodes the full text with the summarizer's tokenizer and slices the
    token stream into windows of CHUNK_SIZE, so chunk length is measured in
    model tokens rather than characters.

    Args:
        text: Full document text.

    Returns:
        List of decoded text chunks, each at most CHUNK_SIZE tokens long.
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for start in range(0, len(tokens), CHUNK_SIZE):
        # BUGFIX: the original bound this decode result to a local named
        # `chunk_text`, shadowing the function itself — renamed to `piece`.
        piece = tokenizer.decode(
            tokens[start:start + CHUNK_SIZE],
            skip_special_tokens=True,
        )
        chunks.append(piece)
    return chunks
def generate_ai_advice(summary: str) -> str:
    """Generate personalized study advice based on the paper summary.

    Args:
        summary: Summary text; only the first 1000 characters are sent to
            the model to keep the prompt within flan-t5-small's budget.

    Returns:
        Markdown section containing up to 5 bullet-point study tips.
    """
    truncated_summary = summary[:1000]
    prompt = (
        f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
        "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
        "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
        "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
        "Output only the 5 tips as bullet points, nothing else."
    )
    # Greedy decoding. `temperature` was removed: it is ignored (and emits a
    # warning) when do_sample=False.
    generated = advice_generator(
        prompt,
        max_length=250,
        num_return_sequences=1,
        do_sample=False
    )[0]["generated_text"]
    # Prefer newline-separated tips. (The original filter's
    # `startswith('-') or tip.strip()` reduced to plain truthiness.)
    tips = [tip.strip() for tip in generated.split('\n') if tip.strip()]
    if len(tips) < 3:
        # Model returned a single blob — fall back to sentence splitting.
        tips = [t.strip() for t in generated.split('.') if t.strip()]
    advice_md = "\n\n---\n\n### π AI-Generated Study Tips\n\n"
    for tip in tips[:5]:
        clean_tip = tip.lstrip('- ').strip()
        advice_md += f"- {clean_tip}\n"
    advice_md += "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) for long-term retention!"
    return advice_md
def extract_possible_headings(text: str) -> str:
    """Heuristically pull likely titles/subtitles out of raw text.

    A line counts as a heading when it is non-empty, shorter than 80
    characters, and either ALL CAPS, numbered ("1. Intro"), or Title Case
    ("Deep Learning ...").

    Returns:
        A markdown section listing the headings, or "" when none are found.
    """
    numbered = re.compile(r'^\d+\.?\s')
    title_case = re.compile(r'^[A-Z][a-z]+\s[A-Z]')
    found = []
    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if not candidate or len(candidate) >= 80:
            continue  # blank or too long to be a heading
        if candidate.isupper() or numbered.match(candidate) or title_case.match(candidate):
            found.append(candidate)
    if not found:
        return ""
    bullet_list = "\n- ".join([''] + found)
    return "### Extracted Possible Headings/Subtitles\n\n" + bullet_list + "\n\n---\n\n"
def summarize_long_text(text: str, progress=gr.Progress()) -> str:
    """Summarize long text in chunks + add AI study advice.

    Pipeline: extract heading candidates -> token-chunk the text ->
    summarize each chunk -> format as bullets -> append AI study tips.

    Args:
        text: Full document text (pasted or PDF-extracted).
        progress: Gradio progress tracker (injected by Gradio per call).

    Returns:
        Markdown string: headings section + bulleted summary + study tips.
    """
    if not text or len(text.strip()) == 0:
        return "No text provided."
    progress(0, desc="Extracting headings...")
    # Extract possible headings first
    headings_section = extract_possible_headings(text)
    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text)
    summaries = []
    progress(0.2, desc="Summarizing chunks...")
    for i in progress.tqdm(range(len(chunks))):
        chunk = chunks[i]
        try:
            summary = summarizer(
                chunk,
                max_length=200,  # Reduced slightly for speed (compromise between length and time)
                min_length=60,  # Reduced for speed
                do_sample=False
            )[0]["summary_text"]
            cleaned = clean_text(summary)
            summaries.append(f"**Chunk {i+1} Summary:** {cleaned}")
        except Exception:
            # Best-effort: a failing chunk is dropped so the rest of the
            # document still gets summarized (chunk numbering reflects this).
            pass  # skip problematic chunks
    # Format summaries as bullet points
    summary_md = "### Detailed Summary (in Bullet Points)\n\n"
    for s in summaries:
        summary_md += f"- {s}\n"
    progress(0.8, desc="Generating AI advice...")
    ai_advice = generate_ai_advice(summary_md)  # Use the bulleted summary for advice generation
    progress(1, desc="Done!")
    return headings_section + summary_md + ai_advice
def read_pdf(file) -> str:
    """Extract all page text from a PDF; returns an error string on failure."""
    try:
        document = PdfReader(file)
        extracted = []
        for page in document.pages:
            # extract_text() may return None for image-only pages.
            extracted.append(page.extract_text() or "")
        # Newline-join preserves line breaks for heading detection downstream.
        return "\n".join(extracted)
    except Exception as e:
        return f"PDF read error: {str(e)}"
# =========================
# Download helper
# =========================
def create_download_file(content: str) -> str:
    """Write *content* to a temporary .txt file and return its path.

    The file is created with delete=False so it survives for Gradio's
    File download component to serve.
    """
    handle = tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    )
    with handle as out:
        out.write(content)
    return handle.name
# =========================
# Main handler
# =========================
def process_input(text: str, file, progress=gr.Progress()):
    """Entry point wired to the Summarize button.

    Prefers an uploaded PDF over pasted text.

    Args:
        text: Pasted text from the textbox (may be None or empty).
        file: Uploaded PDF file object, or None.
        progress: Gradio progress tracker.

    Returns:
        (markdown result, download-file path); the path is None when
        there is no usable input.
    """
    input_text = ""
    progress(0, desc="Reading input...")
    if file is not None:
        input_text = read_pdf(file)
    # BUGFIX: guard against None — Gradio can pass None for an empty
    # textbox, and the original `text.strip()` would raise AttributeError.
    elif text and text.strip():
        input_text = text
    else:
        return "Please paste some text or upload a PDF.", None
    result = summarize_long_text(input_text, progress)
    download_path = create_download_file(result)
    return result, download_path
# =========================
# Gradio UI
# =========================
# Build the UI: text/PDF input -> summarize button -> markdown-ish output
# plus a downloadable .txt of the full result.
with gr.Blocks() as demo:
    gr.Markdown("# π Long Text Summarizer + AI Study Assistant")
    gr.Markdown(
        "β’ Handles very long documents (thousands of words)\n"
        "β’ Supports **PDF** upload or direct paste\n"
        "β’ Runs on CPU β works on free hardware\n"
        "β’ Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
        "β’ Includes **5 AI-generated study tips** tailored to the content\n"
        "β’ Download result as .txt file\n"
        "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
    )
    with gr.Row():
        # Either input may be used; process_input prefers the PDF.
        text_input = gr.Textbox(
            lines=10,
            label="Paste your text here (optional)",
            placeholder="Paste lecture notes, article, book chapter...",
        )
        file_input = gr.File(
            label="Or upload a PDF",
            file_types=[".pdf"]
        )
    summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
    output = gr.Textbox(
        lines=16,
        label="Summary + AI-generated study advice",
        interactive=False
    )
    download_output = gr.File(
        label="Download full result (.txt)",
        interactive=False
    )
    # Wire the button to the main handler; outputs map to (text, file path).
    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input],
        outputs=[output, download_output]
    )
demo.launch()