Update app.py
Browse files
app.py
CHANGED
|
@@ -2,6 +2,8 @@ import gradio as gr
|
|
| 2 |
import re
|
| 3 |
from transformers import pipeline, AutoTokenizer
|
| 4 |
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
| 5 |
|
| 6 |
# =========================
|
| 7 |
# Model setup (CPU-safe)
|
|
@@ -17,11 +19,57 @@ summarizer = pipeline(
|
|
| 17 |
|
| 18 |
CHUNK_SIZE = 900 # safe margin
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# =========================
|
| 21 |
# Utilities
|
| 22 |
# =========================
|
| 23 |
def clean_text(text: str) -> str:
|
| 24 |
-
"""Fix quotes, spacing, repetition, and broken punctuation."""
|
| 25 |
text = text.replace("β", "'").replace("β", "'")
|
| 26 |
text = text.replace("β", '"').replace("β", '"')
|
| 27 |
text = re.sub(r"[.]{2,}", ".", text)
|
|
@@ -39,7 +87,6 @@ def clean_text(text: str) -> str:
|
|
| 39 |
|
| 40 |
|
| 41 |
def chunk_text(text: str):
|
| 42 |
-
"""Token-aware chunking to avoid model overflow."""
|
| 43 |
tokens = tokenizer.encode(text, add_special_tokens=False)
|
| 44 |
chunks = []
|
| 45 |
for i in range(0, len(tokens), CHUNK_SIZE):
|
|
@@ -49,8 +96,47 @@ def chunk_text(text: str):
|
|
| 49 |
return chunks
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def summarize_long_text(text: str) -> str:
|
| 53 |
-
"""Summarize arbitrarily long text safely."""
|
| 54 |
if not text or len(text.strip()) == 0:
|
| 55 |
return "No text provided."
|
| 56 |
|
|
@@ -69,32 +155,13 @@ def summarize_long_text(text: str) -> str:
|
|
| 69 |
merged = " ".join(summaries)
|
| 70 |
cleaned_summary = clean_text(merged)
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
---
|
| 76 |
-
|
| 77 |
-
### π How to Study This Summary Effectively
|
| 78 |
-
|
| 79 |
-
Here are some proven techniques to help you learn and remember the material better:
|
| 80 |
-
|
| 81 |
-
- **Active Recall** β Cover the summary (or close your eyes) and try to explain each main point in your own words. This is one of the most powerful ways to strengthen memory.
|
| 82 |
-
- **Spaced Repetition** β Review this summary today, again in 2β3 days, then in one week. Use free apps like Anki or Quizlet to turn key points into flashcards.
|
| 83 |
-
- **Feynman Technique** β Pretend you're teaching this topic to a friend (or a 12-year-old). Explaining it simply reveals what you truly understand.
|
| 84 |
-
- **Self-Testing** β Create 3β5 questions from the summary (e.g. βWhat isβ¦?β, βWhy doesβ¦?β, βGive an example ofβ¦β). Answer them without looking.
|
| 85 |
-
- **Make Connections** β Draw a quick mind map or diagram linking the main ideas together. This helps see the big picture.
|
| 86 |
-
- **Apply It** β If possible, solve related problems, write a short paragraph, or discuss the topic with someone.
|
| 87 |
-
|
| 88 |
-
Re-reading alone is weak β **active engagement** is what makes information stick!
|
| 89 |
-
|
| 90 |
-
Good luck with your studies! π
|
| 91 |
-
"""
|
| 92 |
|
| 93 |
-
return cleaned_summary +
|
| 94 |
|
| 95 |
|
| 96 |
def read_pdf(file) -> str:
|
| 97 |
-
"""Safely extract text from PDF."""
|
| 98 |
try:
|
| 99 |
reader = PdfReader(file)
|
| 100 |
pages = [page.extract_text() or "" for page in reader.pages]
|
|
@@ -116,18 +183,18 @@ def process_input(text, file):
|
|
| 116 |
# Gradio UI
|
| 117 |
# =========================
|
| 118 |
with gr.Blocks() as demo:
|
| 119 |
-
gr.Markdown("# π Long Text Summarizer
|
| 120 |
gr.Markdown(
|
| 121 |
"β’ Handles **thousands of words**\n"
|
| 122 |
"β’ Supports **PDF upload**\n"
|
| 123 |
"β’ Optimized for **CPU / free tier**\n"
|
| 124 |
-
"β’ Includes **study tips** to
|
| 125 |
)
|
| 126 |
|
| 127 |
text_input = gr.Textbox(
|
| 128 |
lines=15,
|
| 129 |
label="Paste text (optional)",
|
| 130 |
-
placeholder="Paste
|
| 131 |
)
|
| 132 |
|
| 133 |
file_input = gr.File(
|
|
@@ -136,9 +203,9 @@ with gr.Blocks() as demo:
|
|
| 136 |
)
|
| 137 |
|
| 138 |
output = gr.Textbox(
|
| 139 |
-
lines=
|
| 140 |
-
label="Summary + Study Advice",
|
| 141 |
-
placeholder="
|
| 142 |
)
|
| 143 |
|
| 144 |
summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
|
|
|
|
| 2 |
import re
|
| 3 |
from transformers import pipeline, AutoTokenizer
|
| 4 |
from PyPDF2 import PdfReader
|
| 5 |
+
from collections import Counter
|
| 6 |
+
import string
|
| 7 |
|
| 8 |
# =========================
|
| 9 |
# Model setup (CPU-safe)
|
|
|
|
| 19 |
|
| 20 |
CHUNK_SIZE = 900 # safe margin
|
| 21 |
|
| 22 |
+
# =========================
|
| 23 |
+
# Subject-specific tip triggers (expandable)
|
| 24 |
+
# =========================
|
| 25 |
+
# Tips shared by the "math" category and its keyword aliases. They are bound
# to their own name first because a dict literal cannot look itself up while
# it is still being built: referring to SUBJECT_TIPS["math"] inside the
# literal raises NameError at import time.
_MATH_TIPS = [
    "Practice similar problems step-by-step — repetition builds fluency.",
    "Focus on understanding formulas and when to apply them.",
    "Work backwards from answers to see common mistake patterns.",
]

# Maps a trigger keyword to the study tips offered when that keyword occurs
# inside one of the summary's most frequent words (generate_dynamic_advice
# does a substring test of each key against each extracted keyword).
# NOTE: the alias keys share the very same list object as "math"; treat the
# lists as read-only.
SUBJECT_TIPS = {
    "math": _MATH_TIPS,
    "equation": _MATH_TIPS,  # alias of "math"
    "formula": _MATH_TIPS,  # alias of "math"
    "physics": [
        "Draw free-body diagrams or sketch scenarios to visualize forces/concepts.",
        "Practice unit conversions and dimensional analysis first.",
        "Solve numerical examples to connect theory to real numbers.",
    ],
    "chemistry": [
        "Draw reaction mechanisms and label reactants/products.",
        "Make flashcards for periodic trends, solubility rules, or functional groups.",
        "Balance equations repeatedly until it's automatic.",
    ],
    "biology": [
        "Draw and label diagrams (cells, cycles, anatomy) from memory.",
        "Use mnemonics for processes (e.g., Krebs cycle steps).",
        "Compare/contrast similar concepts (mitosis vs meiosis).",
    ],
    "history": [
        "Create a timeline or flowchart of events and causes/effects.",
        "Make cause-effect chains and link them to bigger themes.",
        "Quiz yourself on dates, people, and turning points.",
    ],
    "literature": [
        "Identify themes, symbols, and character development — write short explanations.",
        "Compare this text to others you've read.",
        "Practice essay-style answers: thesis + evidence + analysis.",
    ],
    # Add more categories as needed: economics, programming, law, etc.
}
|
| 60 |
+
|
| 61 |
+
# Subject-independent study techniques, written as Markdown list-item text.
# generate_dynamic_advice appends the first four of these to every result,
# so the order here decides which ones are shown.
# The numeric ranges use an en dash; the previous text carried a mis-encoded
# character in its place.
GENERAL_TIPS = [
    "Use **Active Recall**: Cover the summary and explain key points out loud or in writing.",
    "Apply **Spaced Repetition**: Review today, in 2–3 days, then in a week (try Anki).",
    "Use **Feynman Technique**: Explain it simply as if teaching a younger student.",
    "Create 3–5 self-test questions from the summary and answer without looking.",
    "Draw a quick mind map connecting the main ideas.",
]
|
| 68 |
+
|
| 69 |
# =========================
|
| 70 |
# Utilities
|
| 71 |
# =========================
|
| 72 |
def clean_text(text: str) -> str:
|
|
|
|
| 73 |
text = text.replace("β", "'").replace("β", "'")
|
| 74 |
text = text.replace("β", '"').replace("β", '"')
|
| 75 |
text = re.sub(r"[.]{2,}", ".", text)
|
|
|
|
| 87 |
|
| 88 |
|
| 89 |
def chunk_text(text: str):
|
|
|
|
| 90 |
tokens = tokenizer.encode(text, add_special_tokens=False)
|
| 91 |
chunks = []
|
| 92 |
for i in range(0, len(tokens), CHUNK_SIZE):
|
|
|
|
| 96 |
return chunks
|
| 97 |
|
| 98 |
|
| 99 |
+
def get_simple_keywords(summary: str, top_n=15):
    """Return the most frequent meaningful words of *summary*.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, and the remainder is split on whitespace. Words that are in the
    built-in stop-word set or shorter than three characters are ignored.

    Args:
        summary: Text to analyse.
        top_n: Maximum number of words to return.

    Returns:
        A list of words ordered from most to least frequent, at most
        ``top_n`` long.
    """
    ignored = {
        "the", "a", "an", "and", "or", "but", "is", "are", "was", "were",
        "this", "that", "these", "those", "in", "on", "at", "to", "of",
        "for", "with", "by", "from", "as", "it", "its",
    }
    # Deleting (rather than replacing) punctuation glues "step-by-step"
    # into a single token, exactly as a plain translate-to-nothing does.
    drop_punctuation = str.maketrans("", "", string.punctuation)
    tokens = summary.lower().translate(drop_punctuation).split()

    frequencies = Counter(
        token for token in tokens if len(token) >= 3 and token not in ignored
    )
    ranked = frequencies.most_common(top_n)
    return [token for token, _count in ranked]
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def generate_dynamic_advice(summary: str) -> str:
    """Build a Markdown block of study tips tailored to *summary*.

    The most frequent words of the summary are compared against the keys of
    ``SUBJECT_TIPS``. Each key that occurs inside one of those words
    contributes its first two tips, once per key. The first four
    ``GENERAL_TIPS`` are always appended afterwards.

    Args:
        summary: The cleaned summary text to derive keywords from.

    Returns:
        Markdown text: a horizontal rule, a heading, one bullet per tip, and
        a closing "Pro tip" line.
    """
    keywords = get_simple_keywords(summary)

    tips = []
    seen_categories = set()
    for word in keywords:
        for category, category_tips in SUBJECT_TIPS.items():
            # Substring test, so a keyword such as "mathematics" triggers
            # the "math" category.
            if category in seen_categories or category not in word:
                continue
            seen_categories.add(category)
            tips.extend(category_tips[:2])  # take up to 2 per category

    # General advice is always included, so the result is never empty and
    # no separate fallback is needed.
    tips.extend(GENERAL_TIPS[:4])

    # Several categories may share one tip list (keyword aliases); drop the
    # repeats while keeping first-seen order.
    unique_tips = list(dict.fromkeys(tips))

    # NOTE(review): the "π" in the heading looks like the remnant of a
    # mis-encoded emoji. The intended glyph cannot be determined from here,
    # so it is left untouched — restore it from the original file.
    header = "\n\n---\n\n### π Personalized Study Tips (based on content)\n\n"
    footer = (
        "\n**Pro tip**: Rewrite the summary in your own words after 24 hours"
        " — this locks in understanding!\n"
    )
    bullets = "".join(f"- {tip}\n" for tip in unique_tips)
    return header + bullets + footer
|
| 137 |
+
|
| 138 |
+
|
| 139 |
def summarize_long_text(text: str) -> str:
|
|
|
|
| 140 |
if not text or len(text.strip()) == 0:
|
| 141 |
return "No text provided."
|
| 142 |
|
|
|
|
| 155 |
merged = " ".join(summaries)
|
| 156 |
cleaned_summary = clean_text(merged)
|
| 157 |
|
| 158 |
+
# Dynamic advice
|
| 159 |
+
dynamic_advice = generate_dynamic_advice(cleaned_summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
+
return cleaned_summary + dynamic_advice
|
| 162 |
|
| 163 |
|
| 164 |
def read_pdf(file) -> str:
|
|
|
|
| 165 |
try:
|
| 166 |
reader = PdfReader(file)
|
| 167 |
pages = [page.extract_text() or "" for page in reader.pages]
|
|
|
|
| 183 |
# Gradio UI
|
| 184 |
# =========================
|
| 185 |
with gr.Blocks() as demo:
|
| 186 |
+
gr.Markdown("# π Long Text Summarizer + Study Assistant")
|
| 187 |
gr.Markdown(
|
| 188 |
"β’ Handles **thousands of words**\n"
|
| 189 |
"β’ Supports **PDF upload**\n"
|
| 190 |
"β’ Optimized for **CPU / free tier**\n"
|
| 191 |
+
"β’ Includes **general + dynamic study tips** tailored to content keywords"
|
| 192 |
)
|
| 193 |
|
| 194 |
text_input = gr.Textbox(
|
| 195 |
lines=15,
|
| 196 |
label="Paste text (optional)",
|
| 197 |
+
placeholder="Paste lecture notes, textbook chapter, article..."
|
| 198 |
)
|
| 199 |
|
| 200 |
file_input = gr.File(
|
|
|
|
| 203 |
)
|
| 204 |
|
| 205 |
output = gr.Textbox(
|
| 206 |
+
lines=16,
|
| 207 |
+
label="Summary + Personalized Study Advice",
|
| 208 |
+
placeholder="Summary appears first, followed by tailored learning tips..."
|
| 209 |
)
|
| 210 |
|
| 211 |
summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
|