Update app.py
Browse files
app.py
CHANGED
|
@@ -28,8 +28,6 @@ SUBJECT_TIPS = {
|
|
| 28 |
"Focus on understanding formulas and when to apply them.",
|
| 29 |
"Work backwards from answers to see common mistake patterns."
|
| 30 |
],
|
| 31 |
-
"equation": SUBJECT_TIPS["math"] if "math" not in SUBJECT_TIPS else [], # alias
|
| 32 |
-
"formula": SUBJECT_TIPS["math"],
|
| 33 |
"physics": [
|
| 34 |
"Draw free-body diagrams or sketch scenarios to visualize forces/concepts.",
|
| 35 |
"Practice unit conversions and dimensional analysis first.",
|
|
@@ -55,9 +53,14 @@ SUBJECT_TIPS = {
|
|
| 55 |
"Compare this text to others you've read.",
|
| 56 |
"Practice essay-style answers: thesis + evidence + analysis."
|
| 57 |
],
|
| 58 |
-
# Add more categories as needed: economics, programming, law, etc.
|
| 59 |
}
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
GENERAL_TIPS = [
|
| 62 |
"Use **Active Recall**: Cover the summary and explain key points out loud or in writing.",
|
| 63 |
"Apply **Spaced Repetition**: Review today, in 2β3 days, then in a week (try Anki).",
|
|
@@ -70,6 +73,7 @@ GENERAL_TIPS = [
|
|
| 70 |
# Utilities
|
| 71 |
# =========================
|
| 72 |
def clean_text(text: str) -> str:
|
|
|
|
| 73 |
text = text.replace("‘", "'").replace("’", "'")
|
| 74 |
text = text.replace("“", '"').replace("”", '"')
|
| 75 |
text = re.sub(r"[.]{2,}", ".", text)
|
|
@@ -87,6 +91,7 @@ def clean_text(text: str) -> str:
|
|
| 87 |
|
| 88 |
|
| 89 |
def chunk_text(text: str):
|
|
|
|
| 90 |
tokens = tokenizer.encode(text, add_special_tokens=False)
|
| 91 |
chunks = []
|
| 92 |
for i in range(0, len(tokens), CHUNK_SIZE):
|
|
@@ -97,11 +102,15 @@ def chunk_text(text: str):
|
|
| 97 |
|
| 98 |
|
| 99 |
def get_simple_keywords(summary: str, top_n=15):
|
| 100 |
-
"""Very basic keyword extraction: most frequent words
|
| 101 |
text = summary.lower()
|
| 102 |
text = text.translate(str.maketrans("", "", string.punctuation))
|
| 103 |
words = text.split()
|
| 104 |
-
stop_words = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
filtered = [w for w in words if w not in stop_words and len(w) > 2]
|
| 106 |
counter = Counter(filtered)
|
| 107 |
return [word for word, _ in counter.most_common(top_n)]
|
|
@@ -116,11 +125,11 @@ def generate_dynamic_advice(summary: str):
|
|
| 116 |
for word in keywords:
|
| 117 |
for category, tips in SUBJECT_TIPS.items():
|
| 118 |
if category in word and category not in seen_categories:
|
| 119 |
-
detected_tips.extend(tips[:2]) #
|
| 120 |
seen_categories.add(category)
|
| 121 |
|
| 122 |
-
# Always
|
| 123 |
-
selected_general = GENERAL_TIPS[:4]
|
| 124 |
|
| 125 |
all_tips = detected_tips + selected_general
|
| 126 |
|
|
@@ -131,12 +140,13 @@ def generate_dynamic_advice(summary: str):
|
|
| 131 |
for tip in all_tips:
|
| 132 |
advice_md += f"- {tip}\n"
|
| 133 |
|
| 134 |
-
advice_md += "\n**Pro tip**:
|
| 135 |
|
| 136 |
return advice_md
|
| 137 |
|
| 138 |
|
| 139 |
def summarize_long_text(text: str) -> str:
|
|
|
|
| 140 |
if not text or len(text.strip()) == 0:
|
| 141 |
return "No text provided."
|
| 142 |
|
|
@@ -155,13 +165,14 @@ def summarize_long_text(text: str) -> str:
|
|
| 155 |
merged = " ".join(summaries)
|
| 156 |
cleaned_summary = clean_text(merged)
|
| 157 |
|
| 158 |
-
#
|
| 159 |
dynamic_advice = generate_dynamic_advice(cleaned_summary)
|
| 160 |
|
| 161 |
return cleaned_summary + dynamic_advice
|
| 162 |
|
| 163 |
|
| 164 |
def read_pdf(file) -> str:
|
|
|
|
| 165 |
try:
|
| 166 |
reader = PdfReader(file)
|
| 167 |
pages = [page.extract_text() or "" for page in reader.pages]
|
|
@@ -188,7 +199,7 @@ with gr.Blocks() as demo:
|
|
| 188 |
"• Handles **thousands of words**\n"
|
| 189 |
"• Supports **PDF upload**\n"
|
| 190 |
"• Optimized for **CPU / free tier**\n"
|
| 191 |
-
"• Includes **general + dynamic study tips**
|
| 192 |
)
|
| 193 |
|
| 194 |
text_input = gr.Textbox(
|
|
|
|
| 28 |
"Focus on understanding formulas and when to apply them.",
|
| 29 |
"Work backwards from answers to see common mistake patterns."
|
| 30 |
],
|
|
|
|
|
|
|
| 31 |
"physics": [
|
| 32 |
"Draw free-body diagrams or sketch scenarios to visualize forces/concepts.",
|
| 33 |
"Practice unit conversions and dimensional analysis first.",
|
|
|
|
| 53 |
"Compare this text to others you've read.",
|
| 54 |
"Practice essay-style answers: thesis + evidence + analysis."
|
| 55 |
],
|
|
|
|
| 56 |
}
|
| 57 |
|
| 58 |
+
# Add aliases safely AFTER the dictionary is fully defined
|
| 59 |
+
SUBJECT_TIPS["equation"] = SUBJECT_TIPS["math"]
|
| 60 |
+
SUBJECT_TIPS["formula"] = SUBJECT_TIPS["math"]
|
| 61 |
+
# You can easily add more: SUBJECT_TIPS["calculus"] = SUBJECT_TIPS["math"]
|
| 62 |
+
# SUBJECT_TIPS["algebra"] = SUBJECT_TIPS["math"] etc.
|
| 63 |
+
|
| 64 |
GENERAL_TIPS = [
|
| 65 |
"Use **Active Recall**: Cover the summary and explain key points out loud or in writing.",
|
| 66 |
"Apply **Spaced Repetition**: Review today, in 2β3 days, then in a week (try Anki).",
|
|
|
|
| 73 |
# Utilities
|
| 74 |
# =========================
|
| 75 |
def clean_text(text: str) -> str:
|
| 76 |
+
"""Fix quotes, spacing, repetition, and broken punctuation."""
|
| 77 |
text = text.replace("‘", "'").replace("’", "'")
|
| 78 |
text = text.replace("“", '"').replace("”", '"')
|
| 79 |
text = re.sub(r"[.]{2,}", ".", text)
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
def chunk_text(text: str):
|
| 94 |
+
"""Token-aware chunking to avoid model overflow."""
|
| 95 |
tokens = tokenizer.encode(text, add_special_tokens=False)
|
| 96 |
chunks = []
|
| 97 |
for i in range(0, len(tokens), CHUNK_SIZE):
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
def get_simple_keywords(summary: str, top_n=15):
|
| 105 |
+
"""Very basic keyword extraction: most frequent meaningful words."""
|
| 106 |
text = summary.lower()
|
| 107 |
text = text.translate(str.maketrans("", "", string.punctuation))
|
| 108 |
words = text.split()
|
| 109 |
+
stop_words = {
|
| 110 |
+
"the", "a", "an", "and", "or", "but", "is", "are", "was", "were",
|
| 111 |
+
"this", "that", "these", "those", "in", "on", "at", "to", "of",
|
| 112 |
+
"for", "with", "by", "from", "as", "it", "its", "be", "have", "has"
|
| 113 |
+
}
|
| 114 |
filtered = [w for w in words if w not in stop_words and len(w) > 2]
|
| 115 |
counter = Counter(filtered)
|
| 116 |
return [word for word, _ in counter.most_common(top_n)]
|
|
|
|
| 125 |
for word in keywords:
|
| 126 |
for category, tips in SUBJECT_TIPS.items():
|
| 127 |
if category in word and category not in seen_categories:
|
| 128 |
+
detected_tips.extend(tips[:2]) # max 2 tips per matched category
|
| 129 |
seen_categories.add(category)
|
| 130 |
|
| 131 |
+
# Always include some general advice
|
| 132 |
+
selected_general = GENERAL_TIPS[:4]
|
| 133 |
|
| 134 |
all_tips = detected_tips + selected_general
|
| 135 |
|
|
|
|
| 140 |
for tip in all_tips:
|
| 141 |
advice_md += f"- {tip}\n"
|
| 142 |
|
| 143 |
+
advice_md += "\n**Pro tip**: Try rewriting the main ideas in your own words after 24 hours — it really helps long-term retention!\n"
|
| 144 |
|
| 145 |
return advice_md
|
| 146 |
|
| 147 |
|
| 148 |
def summarize_long_text(text: str) -> str:
|
| 149 |
+
"""Summarize arbitrarily long text safely + add study advice."""
|
| 150 |
if not text or len(text.strip()) == 0:
|
| 151 |
return "No text provided."
|
| 152 |
|
|
|
|
| 165 |
merged = " ".join(summaries)
|
| 166 |
cleaned_summary = clean_text(merged)
|
| 167 |
|
| 168 |
+
# Generate dynamic study advice
|
| 169 |
dynamic_advice = generate_dynamic_advice(cleaned_summary)
|
| 170 |
|
| 171 |
return cleaned_summary + dynamic_advice
|
| 172 |
|
| 173 |
|
| 174 |
def read_pdf(file) -> str:
|
| 175 |
+
"""Safely extract text from PDF."""
|
| 176 |
try:
|
| 177 |
reader = PdfReader(file)
|
| 178 |
pages = [page.extract_text() or "" for page in reader.pages]
|
|
|
|
| 199 |
"• Handles **thousands of words**\n"
|
| 200 |
"• Supports **PDF upload**\n"
|
| 201 |
"• Optimized for **CPU / free tier**\n"
|
| 202 |
+
"• Includes **general + dynamic study tips** based on content keywords"
|
| 203 |
)
|
| 204 |
|
| 205 |
text_input = gr.Textbox(
|