Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,8 @@ import tempfile
|
|
| 7 |
# =========================
|
| 8 |
# Model setup (CPU-safe)
|
| 9 |
# =========================
|
| 10 |
-
|
|
|
|
| 11 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 12 |
summarizer = pipeline(
|
| 13 |
"summarization",
|
|
@@ -16,10 +17,10 @@ summarizer = pipeline(
|
|
| 16 |
device=-1 # CPU only
|
| 17 |
)
|
| 18 |
|
| 19 |
-
#
|
| 20 |
advice_generator = pipeline(
|
| 21 |
"text2text-generation",
|
| 22 |
-
model="google/flan-t5-
|
| 23 |
device=-1 # CPU only
|
| 24 |
)
|
| 25 |
|
|
@@ -101,28 +102,32 @@ def extract_possible_headings(text: str) -> str:
|
|
| 101 |
return "### Extracted Possible Headings/Subtitles\n\n" + "\n- ".join([''] + headings) + "\n\n---\n\n"
|
| 102 |
return ""
|
| 103 |
|
| 104 |
-
def summarize_long_text(text: str) -> str:
|
| 105 |
"""Summarize long text in chunks + add AI study advice.
|
| 106 |
Now with longer summaries per chunk and formatted as bullet points."""
|
| 107 |
if not text or len(text.strip()) == 0:
|
| 108 |
return "No text provided."
|
| 109 |
|
|
|
|
| 110 |
# Extract possible headings first
|
| 111 |
headings_section = extract_possible_headings(text)
|
| 112 |
|
|
|
|
| 113 |
chunks = chunk_text(text)
|
| 114 |
-
summaries = []
|
| 115 |
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
| 117 |
try:
|
| 118 |
summary = summarizer(
|
| 119 |
chunk,
|
| 120 |
-
max_length=
|
| 121 |
-
min_length=
|
| 122 |
do_sample=False
|
| 123 |
)[0]["summary_text"]
|
| 124 |
cleaned = clean_text(summary)
|
| 125 |
-
summaries.append(f"**Chunk {i} Summary:** {cleaned}")
|
| 126 |
except Exception:
|
| 127 |
pass # skip problematic chunks
|
| 128 |
|
|
@@ -131,8 +136,10 @@ def summarize_long_text(text: str) -> str:
|
|
| 131 |
for s in summaries:
|
| 132 |
summary_md += f"- {s}\n"
|
| 133 |
|
|
|
|
| 134 |
ai_advice = generate_ai_advice(summary_md) # Use the bulleted summary for advice generation
|
| 135 |
|
|
|
|
| 136 |
return headings_section + summary_md + ai_advice
|
| 137 |
|
| 138 |
def read_pdf(file) -> str:
|
|
@@ -156,9 +163,10 @@ def create_download_file(content: str) -> str:
|
|
| 156 |
# =========================
|
| 157 |
# Main handler
|
| 158 |
# =========================
|
| 159 |
-
def process_input(text: str, file):
|
| 160 |
input_text = ""
|
| 161 |
|
|
|
|
| 162 |
if file is not None:
|
| 163 |
input_text = read_pdf(file)
|
| 164 |
elif text.strip():
|
|
@@ -166,7 +174,7 @@ def process_input(text: str, file):
|
|
| 166 |
else:
|
| 167 |
return "Please paste some text or upload a PDF.", None
|
| 168 |
|
| 169 |
-
result = summarize_long_text(input_text)
|
| 170 |
download_path = create_download_file(result)
|
| 171 |
|
| 172 |
return result, download_path
|
|
@@ -182,7 +190,8 @@ with gr.Blocks() as demo:
|
|
| 182 |
"• Runs on CPU – works on free hardware\n"
|
| 183 |
"• Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
|
| 184 |
"• Includes **5 AI-generated study tips** tailored to the content\n"
|
| 185 |
-
"• Download result as .txt file"
|
|
|
|
| 186 |
)
|
| 187 |
|
| 188 |
with gr.Row():
|
|
|
|
| 7 |
# =========================
|
| 8 |
# Model setup (CPU-safe)
|
| 9 |
# =========================
|
| 10 |
+
# Use smaller, faster models to speed up processing
|
| 11 |
+
MODEL_NAME = "sshleifer/distilbart-cnn-6-6" # Smaller than 12-6, faster on CPU
|
| 12 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 13 |
summarizer = pipeline(
|
| 14 |
"summarization",
|
|
|
|
| 17 |
device=-1 # CPU only
|
| 18 |
)
|
| 19 |
|
| 20 |
+
# Use smaller flan-t5-small for faster advice generation
|
| 21 |
advice_generator = pipeline(
|
| 22 |
"text2text-generation",
|
| 23 |
+
model="google/flan-t5-small",
|
| 24 |
device=-1 # CPU only
|
| 25 |
)
|
| 26 |
|
|
|
|
| 102 |
return "### Extracted Possible Headings/Subtitles\n\n" + "\n- ".join([''] + headings) + "\n\n---\n\n"
|
| 103 |
return ""
|
| 104 |
|
| 105 |
+
def summarize_long_text(text: str, progress=gr.Progress()) -> str:
|
| 106 |
"""Summarize long text in chunks + add AI study advice.
|
| 107 |
Now with longer summaries per chunk and formatted as bullet points."""
|
| 108 |
if not text or len(text.strip()) == 0:
|
| 109 |
return "No text provided."
|
| 110 |
|
| 111 |
+
progress(0, desc="Extracting headings...")
|
| 112 |
# Extract possible headings first
|
| 113 |
headings_section = extract_possible_headings(text)
|
| 114 |
|
| 115 |
+
progress(0.1, desc="Chunking text...")
|
| 116 |
chunks = chunk_text(text)
|
|
|
|
| 117 |
|
| 118 |
+
summaries = []
|
| 119 |
+
progress(0.2, desc="Summarizing chunks...")
|
| 120 |
+
for i in progress.tqdm(range(len(chunks))):
|
| 121 |
+
chunk = chunks[i]
|
| 122 |
try:
|
| 123 |
summary = summarizer(
|
| 124 |
chunk,
|
| 125 |
+
max_length=200, # Reduced slightly for speed (compromise between length and time)
|
| 126 |
+
min_length=60, # Reduced for speed
|
| 127 |
do_sample=False
|
| 128 |
)[0]["summary_text"]
|
| 129 |
cleaned = clean_text(summary)
|
| 130 |
+
summaries.append(f"**Chunk {i+1} Summary:** {cleaned}")
|
| 131 |
except Exception:
|
| 132 |
pass # skip problematic chunks
|
| 133 |
|
|
|
|
| 136 |
for s in summaries:
|
| 137 |
summary_md += f"- {s}\n"
|
| 138 |
|
| 139 |
+
progress(0.8, desc="Generating AI advice...")
|
| 140 |
ai_advice = generate_ai_advice(summary_md) # Use the bulleted summary for advice generation
|
| 141 |
|
| 142 |
+
progress(1, desc="Done!")
|
| 143 |
return headings_section + summary_md + ai_advice
|
| 144 |
|
| 145 |
def read_pdf(file) -> str:
|
|
|
|
| 163 |
# =========================
|
| 164 |
# Main handler
|
| 165 |
# =========================
|
| 166 |
+
def process_input(text: str, file, progress=gr.Progress()):
|
| 167 |
input_text = ""
|
| 168 |
|
| 169 |
+
progress(0, desc="Reading input...")
|
| 170 |
if file is not None:
|
| 171 |
input_text = read_pdf(file)
|
| 172 |
elif text.strip():
|
|
|
|
| 174 |
else:
|
| 175 |
return "Please paste some text or upload a PDF.", None
|
| 176 |
|
| 177 |
+
result = summarize_long_text(input_text, progress)
|
| 178 |
download_path = create_download_file(result)
|
| 179 |
|
| 180 |
return result, download_path
|
|
|
|
| 190 |
"• Runs on CPU – works on free hardware\n"
|
| 191 |
"• Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
|
| 192 |
"• Includes **5 AI-generated study tips** tailored to the content\n"
|
| 193 |
+
"• Download result as .txt file\n"
|
| 194 |
+
"**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
|
| 195 |
)
|
| 196 |
|
| 197 |
with gr.Row():
|