File size: 8,133 Bytes
487a5d4
b91ee99
 
 
641953a
b898d31
b91ee99
 
 
bb331f0
 
b898d31
b91ee99
 
 
 
 
 
 
bb331f0
8c3fb35
 
bb331f0
8c3fb35
 
d8547ca
641953a
6a4c4d0
b91ee99
2f00a52
b91ee99
 
641953a
b91ee99
 
 
 
 
 
 
 
 
 
 
 
 
 
b898d31
b91ee99
d8547ca
b91ee99
487a5d4
b91ee99
 
b898d31
 
487a5d4
 
8c3fb35
641953a
 
af3ae44
8c3fb35
af3ae44
 
 
 
 
8c3fb35
6a4c4d0
af3ae44
 
 
 
641953a
 
af3ae44
 
641953a
af3ae44
641953a
 
af3ae44
8c3fb35
af3ae44
 
 
 
641953a
6a4c4d0
 
aba9518
 
 
 
 
 
 
 
 
 
 
 
 
bb331f0
aba9518
 
b91ee99
 
8c3fb35
bb331f0
aba9518
 
 
bb331f0
b898d31
8c3fb35
bb331f0
 
 
 
641953a
 
 
bb331f0
 
641953a
 
aba9518
bb331f0
641953a
 
8c3fb35
aba9518
 
 
 
 
bb331f0
aba9518
2f00a52
bb331f0
aba9518
b91ee99
 
d8547ca
b91ee99
 
 
aba9518
b91ee99
641953a
 
 
 
 
 
 
 
 
 
b91ee99
 
 
 
bb331f0
641953a
 
bb331f0
b91ee99
641953a
 
 
 
 
 
bb331f0
641953a
 
 
b91ee99
 
 
 
2f00a52
8c3fb35
b91ee99
641953a
 
 
aba9518
 
bb331f0
 
b91ee99
8c3fb35
641953a
 
 
 
 
 
 
 
 
 
8c3fb35
641953a
8c3fb35
b91ee99
6a4c4d0
641953a
 
b91ee99
8c3fb35
641953a
 
 
 
8c3fb35
b91ee99
 
 
641953a
b91ee99
487a5d4
8091a97
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import gradio as gr
import re
from transformers import pipeline, AutoTokenizer
from PyPDF2 import PdfReader
import tempfile

# =========================
# Model setup (CPU-safe)
# =========================
# Use smaller, faster models to speed up processing.
# NOTE: both pipelines are constructed at import time, so the first run of
# this module downloads/loads the model weights before the UI appears.
MODEL_NAME = "sshleifer/distilbart-cnn-6-6"  # Smaller than 12-6, faster on CPU
# Tokenizer is shared with chunk_text() below so chunk boundaries are
# measured in the same tokens the summarizer consumes.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1  # CPU only
)

# Use smaller flan-t5-small for faster advice generation
advice_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=-1  # CPU only
)

# Max tokens per summarization chunk; kept under the model's input limit
# (distilbart accepts 1024 tokens) to leave a safety margin.
CHUNK_SIZE = 900  # safe margin under typical max input

# =========================
# Utilities
# =========================
def clean_text(text: str) -> str:
    """Normalize quotes/whitespace, collapse repeated punctuation, and
    drop exact duplicate sentences.

    Steps, in order:
      1. Map curly single/double quotes to their ASCII equivalents.
      2. Collapse runs of '.' and "'" into a single character.
      3. Collapse all whitespace runs into a single space.
      4. Split into sentences and keep only the first occurrence of each
         (case-insensitively compared) sentence, preserving order.

    Args:
        text: Raw text, e.g. a model-generated summary.

    Returns:
        The cleaned, de-duplicated text.
    """
    # Bug fix: the original called .replace() with mojibake byte sequences
    # (curly quotes mis-decoded as "β€˜", "β€œ", ...) that never match real
    # curly-quote characters. Use the actual Unicode code points instead.
    text = text.translate(str.maketrans({
        "\u2018": "'",   # left single quotation mark
        "\u2019": "'",   # right single quotation mark
        "\u201c": '"',   # left double quotation mark
        "\u201d": '"',   # right double quotation mark
    }))
    text = re.sub(r"[.]{2,}", ".", text)   # "..." -> "."
    text = re.sub(r"[']{2,}", "'", text)   # "''"  -> "'"
    text = re.sub(r"\s+", " ", text)       # collapse whitespace runs
    # De-duplicate sentences: summarization models often repeat verbatim.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    seen = set()
    result = []
    for s in sentences:
        key = s.strip().lower()
        if key and key not in seen:
            seen.add(key)
            result.append(s.strip())
    return " ".join(result)

def chunk_text(text: str):
    """Split *text* into token-bounded chunks the summarizer can accept.

    Encodes the full text with the module-level summarizer tokenizer,
    slices the token ids into windows of at most CHUNK_SIZE, and decodes
    each window back into a string.

    Args:
        text: Arbitrary-length input text.

    Returns:
        list[str]: Decoded chunks, each at most CHUNK_SIZE tokens long
        (empty list for empty input).
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for start in range(0, len(tokens), CHUNK_SIZE):
        window = tokens[start:start + CHUNK_SIZE]
        # Bug fix: the original bound the decoded string to a local named
        # `chunk_text`, shadowing this function's own name inside its body.
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))
    return chunks

def generate_ai_advice(summary: str) -> str:
    """Generate a markdown section of study tips based on *summary*.

    Prompts the flan-t5 `advice_generator` pipeline for five study tips,
    parses the generated text into individual tips (newline-separated,
    falling back to sentence-splitting when the model emits one line),
    and formats at most five of them as a markdown bullet list.

    Args:
        summary: The (possibly markdown-formatted) paper summary.

    Returns:
        A markdown string: a header, up to five bullet tips, and a
        closing "pro tip" line.
    """
    # Keep the prompt short; flan-t5-small has a limited input window.
    truncated_summary = summary[:1000]

    prompt = (
        f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
        "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
        "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
        "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
        "Output only the 5 tips as bullet points, nothing else."
    )

    # Bug fix: the original passed temperature=0.7 alongside do_sample=False;
    # temperature is ignored (and triggers a warning) when sampling is off.
    generated = advice_generator(
        prompt,
        max_length=250,
        num_return_sequences=1,
        do_sample=False
    )[0]["generated_text"]

    # Prefer newline-separated tips. The original's extra startswith('-')
    # test was tautological: `x.startswith('-') or x` is truthy iff x is.
    tips = [tip.strip() for tip in generated.split('\n') if tip.strip()]
    if len(tips) < 3:
        # Model emitted everything on one line: fall back to sentences.
        tips = [t.strip() for t in generated.split('.') if t.strip()]

    advice_md = "\n\n---\n\n### πŸ“š AI-Generated Study Tips\n\n"
    for tip in tips[:5]:  # unused enumerate index removed
        advice_md += f"- {tip.lstrip('- ').strip()}\n"

    advice_md += "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) for long-term retention!"
    return advice_md

def extract_possible_headings(text: str) -> str:
    """Heuristically pull likely headings/subtitles out of raw text.

    A line counts as a heading when, after stripping, it is non-empty,
    shorter than 80 characters, and either all-uppercase, numbered
    ("1. ..." / "2 ..."), or Title-Cased ("Word Word...").

    Returns:
        A markdown section listing the headings, or "" when none match.
    """
    numbered = re.compile(r'^\d+\.?\s')
    title_cased = re.compile(r'^[A-Z][a-z]+\s[A-Z]')

    def looks_like_heading(candidate: str) -> bool:
        # Apply the length + shape heuristics described above.
        if not candidate or len(candidate) >= 80:
            return False
        return (candidate.isupper()
                or numbered.match(candidate) is not None
                or title_cased.match(candidate) is not None)

    headings = [ln.strip() for ln in text.split('\n') if looks_like_heading(ln.strip())]
    if not headings:
        return ""
    bullet_list = "\n- ".join([''] + headings)
    return "### Extracted Possible Headings/Subtitles\n\n" + bullet_list + "\n\n---\n\n"

def summarize_long_text(text: str, progress=gr.Progress()) -> str:
    """Summarize arbitrarily long text chunk-by-chunk and append AI advice.

    Pipeline: extract candidate headings, split the text into token-bounded
    chunks, summarize each chunk, join the cleaned chunk summaries into a
    bullet list, then ask the advice model for study tips based on it.

    Args:
        text: Raw document text (pasted or PDF-extracted).
        progress: Gradio progress tracker. (The gr.Progress() default is
            gradio's documented injection idiom, not a shared mutable.)

    Returns:
        Markdown combining headings, bulleted chunk summaries, and tips.
    """
    if not text or not text.strip():
        return "No text provided."

    progress(0, desc="Extracting headings...")
    headings_section = extract_possible_headings(text)

    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text)

    summaries = []
    progress(0.2, desc="Summarizing chunks...")
    for i, chunk in enumerate(progress.tqdm(chunks)):
        try:
            summary = summarizer(
                chunk,
                max_length=200,  # speed/length compromise on CPU
                min_length=60,
                do_sample=False
            )[0]["summary_text"]
            summaries.append(f"**Chunk {i+1} Summary:** {clean_text(summary)}")
        except Exception:
            # Best-effort: keep going, but surface the failure instead of
            # silently dropping the chunk (the original used a bare `pass`).
            summaries.append(f"**Chunk {i+1} Summary:** (summarization failed, chunk skipped)")

    # Format summaries as bullet points
    summary_md = "### Detailed Summary (in Bullet Points)\n\n"
    for s in summaries:
        summary_md += f"- {s}\n"

    progress(0.8, desc="Generating AI advice...")
    ai_advice = generate_ai_advice(summary_md)  # advice is based on the bulleted summary

    progress(1, desc="Done!")
    return headings_section + summary_md + ai_advice

def read_pdf(file) -> str:
    """Extract all page text from a PDF, or an error message on failure.

    Pages are joined with newlines so downstream heading detection still
    sees line boundaries. Any PyPDF2 failure is returned as a plain
    "PDF read error: ..." string rather than raised.
    """
    try:
        extracted = []
        for page in PdfReader(file).pages:
            extracted.append(page.extract_text() or "")
        return "\n".join(extracted)
    except Exception as e:
        return f"PDF read error: {str(e)}"

# =========================
# Download helper
# =========================
def create_download_file(content: str) -> str:
    """Write *content* to a temporary UTF-8 .txt file and return its path.

    Created with delete=False so Gradio's File component can still serve
    the file after this function returns.
    """
    tmp = tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    )
    try:
        tmp.write(content)
    finally:
        tmp.close()
    return tmp.name

# =========================
# Main handler
# =========================
def process_input(text: str, file, progress=gr.Progress()):
    """Gradio click handler: summarize pasted text or an uploaded PDF.

    PDF upload takes precedence over pasted text.

    Args:
        text: Contents of the textbox (may be None or empty).
        file: Uploaded file object from gr.File, or None.
        progress: Gradio progress tracker (injected by gradio).

    Returns:
        (markdown result, download file path); the path is None when
        there is no input to process.
    """
    progress(0, desc="Reading input...")
    if file is not None:
        input_text = read_pdf(file)
    elif text and text.strip():  # bug fix: original crashed when text is None
        input_text = text
    else:
        return "Please paste some text or upload a PDF.", None

    result = summarize_long_text(input_text, progress)
    download_path = create_download_file(result)

    return result, download_path

# =========================
# Gradio UI
# =========================
# Build the Blocks layout: title, instructions, paired text/PDF inputs,
# one action button, and two outputs (result text + downloadable .txt).
with gr.Blocks() as demo:
    gr.Markdown("# πŸ“„ Long Text Summarizer + AI Study Assistant")
    gr.Markdown(
        "β€’ Handles very long documents (thousands of words)\n"
        "β€’ Supports **PDF** upload or direct paste\n"
        "β€’ Runs on CPU – works on free hardware\n"
        "β€’ Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
        "β€’ Includes **5 AI-generated study tips** tailored to the content\n"
        "β€’ Download result as .txt file\n"
        "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
    )
    
    # Side-by-side inputs: either one may be used; PDF wins in process_input.
    with gr.Row():
        text_input = gr.Textbox(
            lines=10,
            label="Paste your text here (optional)",
            placeholder="Paste lecture notes, article, book chapter...",
        )
        file_input = gr.File(
            label="Or upload a PDF",
            file_types=[".pdf"]
        )
    
    summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
    
    # Read-only result area; markdown is shown as raw text in a Textbox.
    output = gr.Textbox(
        lines=16,
        label="Summary + AI-generated study advice",
        interactive=False
    )
    
    # Receives the temp-file path returned by process_input.
    download_output = gr.File(
        label="Download full result (.txt)",
        interactive=False
    )
    
    # Wire the button to the handler: two inputs map to (text, file),
    # two outputs map to the handler's (result, download_path) tuple.
    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input],
        outputs=[output, download_output]
    )

demo.launch()