Spaces:

build-small-hackathon
/

LocalDuo

Running on Zero

App Files Files Community

shayekh committed 24 days ago

Commit

5e8a8a0

verified ·

1 Parent(s): 55a83fe

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -30

app.py CHANGED Viewed

@@ -67,9 +67,15 @@ def set_stop_thinking():
 def set_kill_threads():
     global_kill_threads[0] = True
-    print(f"[STOP-THINK] set_kill_threads CALLED! Flag is now: {global_kill_threads[0]}")
     return gr.update(value="🛑 Stopping...")
 def extract_pdf_content(pdf_path, max_pages=2):
     """Extract text and images from up to max_pages of a PDF."""
@@ -236,7 +242,7 @@ def get_base64_image(image):
     return f"data:image/jpeg;base64,{img_str}"
 @spaces.GPU(duration=120)
-def extract_vocabulary(pdf_text, images, translit_lang, translit_format, target_lang, max_text_char=1500, repetition_penalty_val=1.1, partial_assistant_text=None):
     """Use Transformers to extract vocabulary from text and images."""
     global model, processor
@@ -262,7 +268,7 @@ Return ONLY a valid JSON list of dictionaries, where each dictionary has four ke
 Just output raw JSON with ```json and ``` markers, as the user will load in python.
-CRITICAL: Answer quick without very long thinking. Output the JSON array IMMEDIATELY.
 Text:
@@ -300,7 +306,7 @@ Text:
         model.to("cuda")
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         if partial_assistant_text:
-            text += partial_assistant_text + "\n</think>\n\n```json\n[\n"
         inputs = processor(
             text=[text],
@@ -353,14 +359,24 @@ Text:
         thread.start()
         force_triggered = False
         for new_text in streamer:
             output_text += new_text
             yield output_text, None
-            # Check if user clicked "Stop thinking"
-            if global_stop_thinking[0] and not force_triggered:
                 force_triggered = True
-                print("[STOP-THINK] Flag detected inside streamer loop! Killing current generation...")
                 # 1. Kill the current generation thread
                 local_stop[0] = True
@@ -378,7 +394,7 @@ Text:
                 local_stop[0] = False
                 # 3. Append the think-closing + JSON prefix
-                output_text += "\n</think>\n\n```json\n[\n"
                 yield output_text, None
                 # 4. Build new prompt with partial assistant text
@@ -391,7 +407,8 @@ Text:
                     padding=True
                 ).to("cuda")
-                # 5. Start new generation thread
                 streamer2 = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
                 thread2 = Thread(target=run_generation, args=(inputs2, streamer2, local_stop))
                 thread2.start()
@@ -399,8 +416,45 @@ Text:
                 for new_text2 in streamer2:
                     output_text += new_text2
                     yield output_text, None
-                thread2.join(timeout=10)
                 break  # Exit the outer streamer loop
         if not force_triggered:
@@ -453,7 +507,7 @@ Return ONLY a valid JSON list of dictionaries, where each dictionary has four ke
 - 'translation' (the translation into {target_lang.upper()})
 - 'explanation' (a brief grammar or context note in {target_lang.upper()}).
 No markdown formatting, just raw JSON with ```json and ``` markers.
-CRITICAL: Do NOT provide any conversational filler, thinking steps, or reasoning. Answer quick without very long thinking. Output the JSON array IMMEDIATELY.
 Korean words:
 {words_str}
@@ -537,7 +591,7 @@ def hash_file(filepath):
         return hashlib.md5(f.read(1024*1024)).hexdigest()
 @spaces.GPU(duration=120)
-def process_pdf(pdf_file, url_input, audio_file_input, yt_url_input, yt_cookies_file, translit_lang, translit_format, target_lang, max_text_char, repetition_penalty_val, last_source_hash, last_korean_words, active_tab, progress=gr.Progress()):
     global tts, voice_style
     # Clean language choices from "Family - Language" to just "Language"
@@ -548,6 +602,9 @@ def process_pdf(pdf_file, url_input, audio_file_input, yt_url_input, yt_cookies_
     os.makedirs("log", exist_ok=True)
     # Determine input source based on active tab
     is_url = (active_tab == "Website URL") and bool(url_input and url_input.strip())
     is_youtube = (active_tab == "YouTube Link") and bool(yt_url_input and yt_url_input.strip() and is_youtube_url(yt_url_input.strip()))
@@ -619,8 +676,11 @@ def process_pdf(pdf_file, url_input, audio_file_input, yt_url_input, yt_cookies_
     vocab_list = []
     stream_text = ""
     for attempt in range(1, 4):
         progress(0.2, desc=f"Extracting vocabulary (Attempt {attempt}/3)...")
-        for stream_t, v_list in extract_vocabulary(content_text, images, translit_lang, translit_format, target_lang, max_text_char, repetition_penalty_val):
             stream_text = stream_t
             if v_list is not None:
                 vocab_list = v_list
@@ -629,6 +689,31 @@ def process_pdf(pdf_file, url_input, audio_file_input, yt_url_input, yt_cookies_
         if vocab_list:
             break
     if not vocab_list:
         yield "<p>Failed to extract or translate vocabulary after 3 attempts.</p>", current_source_hash, None, stream_text, content_text, images, extracted_audio_path
         return
@@ -958,7 +1043,7 @@ def process_pdf(pdf_file, url_input, audio_file_input, yt_url_input, yt_cookies_
     # Return the iframe containing the whole SPA
     yield f'<iframe srcdoc="{safe_srcdoc}" style="width: 100%; height: 650px; border: none; overflow-y: auto;"></iframe>', current_source_hash, vocab_list, stream_text, content_text, images, extracted_audio_path
-LANGUAGE_DATA = """Indo-European	English, French, Portuguese, German, Romanian, Swedish, Danish, Bulgarian, Russian, Czech, Greek, Ukrainian, Spanish, Dutch, Slovak, Croatian, Polish, Lithuanian, Norwegian Bokmål, Norwegian Nynorsk, Persian, Slovenian, Gujarati, Latvian, Italian, Occitan, Nepali, Marathi, Belarusian, Serbian, Luxembourgish, Venetian, Assamese, Welsh, Silesian, Asturian, Chhattisgarhi, Awadhi, Maithili, Bhojpuri, Sindhi, Irish, Faroese, Hindi, Punjabi, Bengali, Oriya, Tajik, Eastern Yiddish, Lombard, Ligurian, Sicilian, Friulian, Sardinian, Galician, Catalan, Icelandic, Tosk Albanian, Limburgish, Dari, Afrikaans, Macedonian, Sinhala, Urdu, Magahi, Bosnian, Armenian, Latgalian, Scottish Gaelic, Central Kurdish, Northern Kurdish, Southern Pashto, Sanskrit, Dhundari, Marwari, Ahirani, Bagheli, Bagri, Bundeli, Braj, Kumaoni, Kashmiri
 Sino-Tibetan	Chinese (Simplified), Chinese (Traditional), Cantonese, Burmese, Standard Tibetan, Meitei
 Afro-Asiatic	Arabic (Standard), Arabic (Najdi), Arabic (Levantine), Arabic (Egyptian), Arabic (Moroccan), Arabic (Mesopotamian), Arabic (Ta’izzi-Adeni), Arabic (Tunisian), Arabic (Gulf), Arabic (Algerian), Arabic (Sudanese), Arabic (Libyan), Hebrew, Maltese, Amharic, Tigrinya, Kabyle, Somali, West Central Oromo, Hausa
 Austronesian	Indonesian, Malay, Tagalog, Cebuano, Javanese, Sundanese, Minangkabau, Balinese, Banjar, Pangasinan, Iloko, Waray (Philippines), Plateau Malagasy, Malagasy, Buginese, Maori, Samoan, Hawaiian, Fijian
@@ -967,8 +1052,8 @@ Turkic	Turkish, North Azerbaijani, Northern Uzbek, Kazakh, Bashkir, Tatar, Crime
 Tai-Kadai	Thai, Lao, Shan
 Uralic	Finnish, Estonian, Hungarian, Meadow Mari
 Austroasiatic	Vietnamese, Khmer
-Niger–Congo	Yoruba, Ewe, Kinyarwanda, Lingala, Northern Sotho, Nyanja, Shona, Southern Sotho, Tswana, Xhosa, Zulu, Luganda, Swati, Tsonga, Tumbuka, Venda, Chokwe, Luba-Kasai, Rundi, Umbundu, Kikuyu, Kongo, Nigerian Fulfulde, Wolof, Fon, Kabiyè, Mossi, Akan, Twi, Bambara, Igbo
-Other	Japanese, Korean, Georgian, Basque, Haitian, Papiamento, Kabuverdianu, Tok Pisin, Swahili, Central Aymara, Tulu, Nagamese, Nigerian Pidgin, Mauritian Creole, Sango, Ayacucho Quechua, Halh Mongolian, Southwestern Dinka, Nuer, Guarani"""
 LANGUAGE_CHOICES = []
 for line in LANGUAGE_DATA.strip().split('\n'):
@@ -1222,19 +1307,22 @@ def create_demo():
                 tab_yt.select(fn=lambda: "YouTube Link", inputs=None, outputs=active_tab)
                 gr.Markdown("### ⚙️ Customization Settings")
-                translit_lang = gr.Dropdown(
-                    label="Word Transliteration Language",
-                    choices=LANGUAGE_CHOICES,
-                    value="Indo-European - English"
-                )
-                translit_format = gr.Dropdown(label="Transliteration Format", choices=["dashed syllable", "regular word with space"], value="dashed syllable")
-                target_lang = gr.Dropdown(
-                    label="Target Language (Full App)",
-                    choices=LANGUAGE_CHOICES,
-                    value="Indo-European - English"
-                )
                 max_text_char_input = gr.Slider(minimum=1000, maximum=30000, step=1000, value=1500, label="Max Input Text Length (Characters)")
                 repetition_penalty_input = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.2, label="Repetition Penalty")
                 with gr.Row():
                     submit_btn = gr.Button("✨ Generate Flashcards ✨", variant="primary")
@@ -1260,13 +1348,13 @@ def create_demo():
         generate_event = submit_btn.click(
             fn=process_pdf,
-            inputs=[pdf_input, url_input, audio_file_input, yt_url_input, yt_cookies_input, translit_lang, translit_format, target_lang, max_text_char_input, repetition_penalty_input, last_source_state, last_korean_words_state, active_tab],
             outputs=[output_html, last_source_state, last_korean_words_state, stream_box, extracted_text_box, extracted_images_gallery, extracted_audio_player]
         )
         stop_thinking_btn.click(fn=set_stop_thinking, inputs=None, outputs=stop_thinking_btn, queue=False)
-        stop_btn.click(fn=set_kill_threads, inputs=None, outputs=stop_btn, queue=False).then(fn=None, inputs=None, outputs=None, cancels=[generate_event])
         # Force autoscroll using Custom JS
         stream_box.change(

 def set_kill_threads():
     global_kill_threads[0] = True
+    print(f"[KILL] set_kill_threads CALLED! Flag is now: {global_kill_threads[0]}")
     return gr.update(value="🛑 Stopping...")
+def reset_generation_flags():
+    """Reset all generation control flags at the start of a new generation."""
+    global_stop_thinking[0] = False
+    global_kill_threads[0] = False
+    print("[FLAGS] Reset stop_thinking and kill_threads to False")
 def extract_pdf_content(pdf_path, max_pages=2):
     """Extract text and images from up to max_pages of a PDF."""
     return f"data:image/jpeg;base64,{img_str}"
 @spaces.GPU(duration=120)
+def extract_vocabulary(pdf_text, images, translit_lang, translit_format, target_lang, max_text_char=1500, repetition_penalty_val=1.1, partial_assistant_text=None, auto_force_chars=1000):
     """Use Transformers to extract vocabulary from text and images."""
     global model, processor
 Just output raw JSON with ```json and ``` markers, as the user will load in python.
+CRITICAL: Do NOT overthink. Do NOT deliberate over conditions, edge cases, or reasoning. Keep your thinking extremely brief (a few words at most). Output the JSON array IMMEDIATELY without lengthy analysis.
 Text:
         model.to("cuda")
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         if partial_assistant_text:
+            text += partial_assistant_text + "\nReady to generate.\n</think>\n\n```json\n[\n"
         inputs = processor(
             text=[text],
         thread.start()
         force_triggered = False
+        AUTO_FORCE_CHARS = auto_force_chars
         for new_text in streamer:
             output_text += new_text
             yield output_text, None
+            # Auto-force JSON if thinking exceeds AUTO_FORCE_CHARS characters without producing JSON
+            should_auto_force = (
+                not force_triggered
+                and not partial_assistant_text
+                and len(output_text) > AUTO_FORCE_CHARS
+                and '```json' not in output_text
+            )
+            # Check if user clicked "Stop thinking" OR auto-force threshold reached
+            if (global_stop_thinking[0] or should_auto_force) and not force_triggered:
                 force_triggered = True
+                reason = "auto-force (>300 chars)" if should_auto_force else "user clicked stop"
+                print(f"[STOP-THINK] Force triggered ({reason})! Killing current generation...")
                 # 1. Kill the current generation thread
                 local_stop[0] = True
                 local_stop[0] = False
                 # 3. Append the think-closing + JSON prefix
+                output_text += "\nReady to generate.\n</think>\n\n```json\n[\n"
                 yield output_text, None
                 # 4. Build new prompt with partial assistant text
                     padding=True
                 ).to("cuda")
+                # 5. Start new generation thread with force-JSON context
+                # This loop also monitors stop_thinking so user can force again if model keeps thinking
                 streamer2 = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
                 thread2 = Thread(target=run_generation, args=(inputs2, streamer2, local_stop))
                 thread2.start()
                 for new_text2 in streamer2:
                     output_text += new_text2
                     yield output_text, None
+                    # Allow user to force again if model still isn't producing JSON
+                    if global_stop_thinking[0] or global_kill_threads[0]:
+                        print("[STOP-THINK] Flag detected in forced generation loop! Killing...")
+                        local_stop[0] = True
+                        while not streamer2.text_queue.empty():
+                            try:
+                                streamer2.text_queue.get_nowait()
+                            except queue.Empty:
+                                break
+                        thread2.join(timeout=5)
+                        global_stop_thinking[0] = False
+                        local_stop[0] = False
+                        # Force JSON prefix again
+                        output_text += "\nReady to generate.\n</think>\n\n```json\n[\n"
+                        yield output_text, None
+                        text3 = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                        text3 += output_text
+                        inputs3 = processor(
+                            text=[text3],
+                            images=pil_images if pil_images else None,
+                            return_tensors="pt",
+                            padding=True
+                        ).to("cuda")
+                        streamer3 = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+                        thread3 = Thread(target=run_generation, args=(inputs3, streamer3, local_stop))
+                        thread3.start()
+                        for new_text3 in streamer3:
+                            output_text += new_text3
+                            yield output_text, None
+                        thread3.join(timeout=10)
+                        break
+                else:
+                    thread2.join(timeout=10)
                 break  # Exit the outer streamer loop
         if not force_triggered:
 - 'translation' (the translation into {target_lang.upper()})
 - 'explanation' (a brief grammar or context note in {target_lang.upper()}).
 No markdown formatting, just raw JSON with ```json and ``` markers.
+CRITICAL: Do NOT overthink. Do NOT deliberate over conditions, edge cases, or reasoning. Keep your thinking extremely brief (5 paragraphs at most). Output the JSON array IMMEDIATELY without lengthy analysis.
 Korean words:
 {words_str}
         return hashlib.md5(f.read(1024*1024)).hexdigest()
 @spaces.GPU(duration=120)
+def process_pdf(pdf_file, url_input, audio_file_input, yt_url_input, yt_cookies_file, translit_lang, translit_format, target_lang, max_text_char, repetition_penalty_val, auto_force_chars_val, last_source_hash, last_korean_words, active_tab, progress=gr.Progress()):
     global tts, voice_style
     # Clean language choices from "Family - Language" to just "Language"
     os.makedirs("log", exist_ok=True)
+    # Reset flags at start of new generation
+    reset_generation_flags()
     # Determine input source based on active tab
     is_url = (active_tab == "Website URL") and bool(url_input and url_input.strip())
     is_youtube = (active_tab == "YouTube Link") and bool(yt_url_input and yt_url_input.strip() and is_youtube_url(yt_url_input.strip()))
     vocab_list = []
     stream_text = ""
     for attempt in range(1, 4):
+        if global_kill_threads[0]:
+            print("[KILL] Kill flag detected, stopping extraction attempts.")
+            break
         progress(0.2, desc=f"Extracting vocabulary (Attempt {attempt}/3)...")
+        for stream_t, v_list in extract_vocabulary(content_text, images, translit_lang, translit_format, target_lang, max_text_char, repetition_penalty_val, auto_force_chars=auto_force_chars_val):
             stream_text = stream_t
             if v_list is not None:
                 vocab_list = v_list
         if vocab_list:
             break
+    # Reset kill flag after extraction so TTS can proceed
+    global_kill_threads[0] = False
+    # If generation was killed but we don't have vocab yet, try to salvage JSON from stream_text
+    if not vocab_list and stream_text:
+        print("[KILL] Attempting to salvage JSON from partial generation output...")
+        try:
+            import re
+            json_matches = list(re.finditer(r'```(?:json)?\s*([\s\S]*?)```', stream_text))
+            if json_matches:
+                clean_text = json_matches[-1].group(1).strip()
+            else:
+                json_matches = list(re.finditer(r'(\[[\s\S]*\]|\{[\s\S]*\})', stream_text))
+                clean_text = json_matches[-1].group(1).strip() if json_matches else ""
+            if clean_text:
+                data = json.loads(clean_text)
+                if not isinstance(data, list):
+                    data = [data]
+                if data and isinstance(data[0], dict) and 'korean' in data[0]:
+                    vocab_list = data
+                    print(f"[KILL] Salvaged {len(vocab_list)} vocab items from partial output!")
+        except Exception as e:
+            print(f"[KILL] Could not salvage JSON: {e}")
     if not vocab_list:
         yield "<p>Failed to extract or translate vocabulary after 3 attempts.</p>", current_source_hash, None, stream_text, content_text, images, extracted_audio_path
         return
     # Return the iframe containing the whole SPA
     yield f'<iframe srcdoc="{safe_srcdoc}" style="width: 100%; height: 650px; border: none; overflow-y: auto;"></iframe>', current_source_hash, vocab_list, stream_text, content_text, images, extracted_audio_path
+LANGUAGE_DATA = """Indo-European	Bengali, English, French, Portuguese, German, Romanian, Swedish, Danish, Bulgarian, Russian, Czech, Greek, Ukrainian, Spanish, Dutch, Slovak, Croatian, Polish, Lithuanian, Norwegian Bokmål, Norwegian Nynorsk, Persian, Slovenian, Gujarati, Latvian, Italian, Occitan, Nepali, Marathi, Belarusian, Serbian, Luxembourgish, Venetian, Assamese, Welsh, Silesian, Asturian, Chhattisgarhi, Awadhi, Maithili, Bhojpuri, Sindhi, Irish, Faroese, Hindi, Punjabi, Oriya, Tajik, Eastern Yiddish, Lombard, Ligurian, Sicilian, Friulian, Sardinian, Galician, Catalan, Icelandic, Tosk Albanian, Limburgish, Dari, Afrikaans, Macedonian, Sinhala, Urdu, Magahi, Bosnian, Armenian, Latgalian, Scottish Gaelic, Central Kurdish, Northern Kurdish, Southern Pashto, Sanskrit, Dhundari, Marwari, Ahirani, Bagheli, Bagri, Bundeli, Braj, Kumaoni, Kashmiri
 Sino-Tibetan	Chinese (Simplified), Chinese (Traditional), Cantonese, Burmese, Standard Tibetan, Meitei
 Afro-Asiatic	Arabic (Standard), Arabic (Najdi), Arabic (Levantine), Arabic (Egyptian), Arabic (Moroccan), Arabic (Mesopotamian), Arabic (Ta’izzi-Adeni), Arabic (Tunisian), Arabic (Gulf), Arabic (Algerian), Arabic (Sudanese), Arabic (Libyan), Hebrew, Maltese, Amharic, Tigrinya, Kabyle, Somali, West Central Oromo, Hausa
 Austronesian	Indonesian, Malay, Tagalog, Cebuano, Javanese, Sundanese, Minangkabau, Balinese, Banjar, Pangasinan, Iloko, Waray (Philippines), Plateau Malagasy, Malagasy, Buginese, Maori, Samoan, Hawaiian, Fijian
 Tai-Kadai	Thai, Lao, Shan
 Uralic	Finnish, Estonian, Hungarian, Meadow Mari
 Austroasiatic	Vietnamese, Khmer
+Niger–Congo	Yoruba, Ewe, Kinyarwanda, Lingala, Northern Sotho, Nyanja, Shona, Southern Sotho, Tswana, Xhosa, Zulu, Luganda, Swati, Tsonga, Tumbuka, Venda, Chokwe, Luba-Kasai, Rundi, Umbundu, Kikuyu, Kongo, Nigerian Fulfulde, Wolof, Fon, Kabiyè, Mossi, Akan, Twi, Bambara, Igbo"""
+# Other	Japanese, Korean, Georgian, Basque, Haitian, Papiamento, Kabuverdianu, Tok Pisin, Swahili, Central Aymara, Tulu, Nagamese, Nigerian Pidgin, Mauritian Creole, Sango, Ayacucho Quechua, Halh Mongolian, Southwestern Dinka, Nuer, Guarani
 LANGUAGE_CHOICES = []
 for line in LANGUAGE_DATA.strip().split('\n'):
                 tab_yt.select(fn=lambda: "YouTube Link", inputs=None, outputs=active_tab)
                 gr.Markdown("### ⚙️ Customization Settings")
                 max_text_char_input = gr.Slider(minimum=1000, maximum=30000, step=1000, value=1500, label="Max Input Text Length (Characters)")
                 repetition_penalty_input = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.2, label="Repetition Penalty")
+                auto_force_chars_input = gr.Slider(minimum=100, maximum=5000, step=100, value=1000, label="Auto-force JSON after (chars of thinking)")
+                with gr.Accordion("🔧 Advanced", open=False):
+                    translit_lang = gr.Dropdown(
+                        label="Word Transliteration Language",
+                        choices=LANGUAGE_CHOICES,
+                        value="Indo-European - English"
+                    )
+                    translit_format = gr.Dropdown(label="Transliteration Format", choices=["dashed syllable", "regular word with space"], value="dashed syllable")
+                    target_lang = gr.Dropdown(
+                        label="Target Language (Full App)",
+                        choices=LANGUAGE_CHOICES,
+                        value="Indo-European - English"
+                    )
                 with gr.Row():
                     submit_btn = gr.Button("✨ Generate Flashcards ✨", variant="primary")
         generate_event = submit_btn.click(
             fn=process_pdf,
+            inputs=[pdf_input, url_input, audio_file_input, yt_url_input, yt_cookies_input, translit_lang, translit_format, target_lang, max_text_char_input, repetition_penalty_input, auto_force_chars_input, last_source_state, last_korean_words_state, active_tab],
             outputs=[output_html, last_source_state, last_korean_words_state, stream_box, extracted_text_box, extracted_images_gallery, extracted_audio_player]
         )
         stop_thinking_btn.click(fn=set_stop_thinking, inputs=None, outputs=stop_thinking_btn, queue=False)
+        stop_btn.click(fn=set_kill_threads, inputs=None, outputs=stop_btn, queue=False)
         # Force autoscroll using Custom JS
         stream_box.change(