Spaces:

build-small-hackathon
/

LocalDuo

Running on Zero

App Files Files Community

shayekh committed 14 days ago

Commit

612024d

verified ·

1 Parent(s): 9e87252

Update app.py

Browse files

Files changed (1) hide show

app.py +484 -4

app.py CHANGED Viewed

@@ -1,7 +1,487 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+# Copyright: Shayekh Bin Islam. KAIST, South Korea. 2026.
 import gradio as gr
+import fitz  # PyMuPDF
+from PIL import Image
+import io
+import json
+import base64
+import soundfile as sf
+import torch
+from supertonic import TTS
+from vllm import LLM, SamplingParams
+llm = None
+sampling_params = None
+tts = None
+voice_style = None
+def extract_pdf_content(pdf_path, max_pages=2):
+    """Extract text and rendered page images from the first pages of a PDF.
+
+    Args:
+        pdf_path: Path of the PDF file, passed to ``fitz.open``.
+        max_pages: Maximum number of leading pages to read.
+
+    Returns:
+        A ``(text, images)`` tuple. ``text`` is every processed page's text,
+        each followed by a newline, concatenated in page order. ``images`` is
+        a list holding one RGB ``PIL.Image`` per processed page, rendered at
+        150 dpi.
+    """
+    page_texts = []
+    images = []
+    # Close the document deterministically, even if rendering a page raises.
+    with fitz.open(pdf_path) as doc:
+        for i in range(min(max_pages, len(doc))):
+            page = doc[i]
+            page_texts.append(page.get_text() + "\n")
+            # alpha=False keeps 3 bytes per pixel, matching the "RGB" mode below.
+            pix = page.get_pixmap(dpi=150, alpha=False)
+            images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
+    return "".join(page_texts), images
+import os
+def get_base64_image(image):
+    """Encode an image as a base64 JPEG ``data:`` URI.
+
+    ``image`` must provide a PIL-style ``save(fp, format=...)`` method; the
+    JPEG bytes it writes are base64-encoded and prefixed with the MIME header.
+    """
+    with io.BytesIO() as jpeg_buffer:
+        image.save(jpeg_buffer, format="JPEG")
+        jpeg_bytes = jpeg_buffer.getvalue()
+    encoded = base64.b64encode(jpeg_bytes).decode("utf-8")
+    return "data:image/jpeg;base64," + encoded
+def _parse_vocabulary_json(output_text):
+    """Parse the model's reply into a list of vocabulary dictionaries.
+
+    Strips an optional Markdown code fence, falls back to the outermost
+    ``[...]`` span when extra text surrounds the JSON, wraps a single JSON
+    value in a list, and drops entries that are not dictionaries.
+
+    Raises:
+        ValueError: If no JSON document can be decoded from ``output_text``.
+    """
+    clean_text = output_text.strip()
+    if clean_text.startswith("```json"):
+        clean_text = clean_text[7:]
+    if clean_text.startswith("```"):
+        clean_text = clean_text[3:]
+    if clean_text.endswith("```"):
+        clean_text = clean_text[:-3]
+    clean_text = clean_text.strip()
+    try:
+        data = json.loads(clean_text)
+    except json.JSONDecodeError:
+        # The model sometimes wraps the list in commentary; retry on the
+        # outermost bracketed span before giving up.
+        start, end = clean_text.find("["), clean_text.rfind("]")
+        if start == -1 or end <= start:
+            raise
+        data = json.loads(clean_text[start:end + 1])
+    if not isinstance(data, list):
+        data = [data]
+    # Callers use dict methods on every entry, so discard anything else.
+    return [item for item in data if isinstance(item, dict)]
+def extract_vocabulary(pdf_text, images, translit_lang, translit_format, target_lang):
+    """Use vLLM to extract vocabulary from text and images.
+
+    Builds a prompt from the first 1500 characters of ``pdf_text``, attaches
+    every image as a base64 JPEG data URI, and sends one chat request through
+    the module-level ``llm`` with ``sampling_params``. The prompt, the images
+    and the raw reply are written to the ``log/`` directory for debugging.
+
+    Args:
+        pdf_text: Text extracted from the PDF.
+        images: Images to attach; each is saved as PNG and JPEG-encoded.
+        translit_lang: Language whose script the transliteration should use.
+        translit_format: Transliteration format description for the prompt.
+        target_lang: Language for translations and explanations.
+
+    Returns:
+        A list of dictionaries decoded from the model's reply (the prompt asks
+        for the keys 'korean', 'transliteration', 'translation' and
+        'explanation'), or an empty list if inference fails or the reply
+        cannot be parsed as JSON.
+    """
+    global llm, sampling_params
+    os.makedirs("log", exist_ok=True)
+    prompt_text = f"""Extract 3 to 5 key Korean words or phrases from the following text and images.
+Return ONLY a valid JSON list of dictionaries, where each dictionary has four keys:
+- 'korean' (the Korean text)
+- 'transliteration' (the pronunciation transliterated into {translit_lang.upper()} script/characters, formatted as {translit_format}. CRITICAL: You MUST use the native alphabet/script of {translit_lang.upper()}, do NOT use English letters unless requested.)
+- 'translation' (the translation into {target_lang.upper()})
+- 'explanation' (a brief grammar or context note in {target_lang.upper()}).
+No markdown formatting, just raw JSON.
+Text:
+{pdf_text[:1500]}
+"""
+    # DEBUG: Log prompt text
+    with open("log/debug_vlm_prompt.txt", "w", encoding="utf-8") as f:
+        f.write(prompt_text)
+    content = [{"type": "text", "text": prompt_text}]
+    for i, img in enumerate(images):
+        # DEBUG: Log images
+        img.save(f"log/debug_image_{i}.png", format="PNG")
+        content.append({
+            "type": "image_url",
+            "image_url": {"url": get_base64_image(img)}
+        })
+    messages = [
+        {
+            "role": "user",
+            "content": content
+        }
+    ]
+    # Inference is a best-effort boundary: any failure yields an empty result.
+    try:
+        outputs = llm.chat(messages=messages, sampling_params=sampling_params)
+        output_text = outputs[0].outputs[0].text
+        # DEBUG: Log raw output text
+        with open("log/debug_vlm_output.txt", "w", encoding="utf-8") as f:
+            f.write(output_text)
+    except Exception as e:
+        print(f"Error during vLLM inference: {e}")
+        return []
+    try:
+        return _parse_vocabulary_json(output_text)
+    except ValueError as e:
+        print(f"Error parsing JSON: {e}\nRaw output: {output_text}")
+        return []
+def numpy_to_base64_audio(wav, sample_rate):
+    """Encode an audio array as a base64 WAV ``data:`` URI.
+
+    The array is squeezed, written in WAV format to an in-memory buffer with
+    ``soundfile`` at ``sample_rate``, and the buffer's bytes are
+    base64-encoded behind an ``audio/wav`` header.
+    """
+    wav_buffer = io.BytesIO()
+    sf.write(wav_buffer, wav.squeeze(), sample_rate, format='WAV')
+    payload = base64.b64encode(wav_buffer.getvalue()).decode('utf-8')
+    return "data:audio/wav;base64," + payload
+def process_pdf(pdf_file, translit_lang, translit_format, target_lang):
+    """Turn an uploaded PDF into an HTML flashcard viewer.
+
+    Extracts text and page images from the PDF, asks the model for a
+    vocabulary list, synthesizes audio for each Korean entry with the
+    module-level ``tts``, and embeds everything in a self-contained page
+    shown through an ``<iframe srcdoc=...>``. Debug audio is written to the
+    ``log/`` directory.
+
+    Args:
+        pdf_file: Uploaded file object whose ``name`` attribute is the path of
+            the PDF, or ``None`` when nothing was uploaded.
+        translit_lang: Transliteration language, optionally in
+            "Family - Language" form.
+        translit_format: Transliteration format passed to the prompt.
+        target_lang: Translation language, optionally in
+            "Family - Language" form.
+
+    Returns:
+        An HTML string: a ``<p>`` message on failure, otherwise the iframe
+        containing the flashcard app.
+    """
+    global tts, voice_style
+    import html
+    # Clean language choices from "Family - Language" to just "Language"
+    if " - " in translit_lang:
+        translit_lang = translit_lang.split(" - ")[-1]
+    if " - " in target_lang:
+        target_lang = target_lang.split(" - ")[-1]
+    os.makedirs("log", exist_ok=True)
+    if pdf_file is None:
+        return "<p>Please upload a PDF.</p>"
+    try:
+        pdf_text, images = extract_pdf_content(pdf_file.name)
+        if not pdf_text.strip() and not images:
+            return "<p>No content found in PDF.</p>"
+    except Exception as e:
+        # The message may contain file names or PDF-derived text; escape it
+        # before placing it in HTML.
+        return f"<p>Error reading PDF: {html.escape(str(e))}</p>"
+    vocab_list = extract_vocabulary(pdf_text, images, translit_lang, translit_format, target_lang)
+    # The model output is untrusted: keep only dictionary entries so the
+    # per-item handling below cannot fail on a stray string or number.
+    vocab_list = [item for item in vocab_list if isinstance(item, dict)]
+    if not vocab_list:
+        return "<p>Failed to extract vocabulary. The model might not have found Korean text or returned an invalid format.</p>"
+    # Pre-generate TTS audio
+    for i, item in enumerate(vocab_list):
+        raw_korean = item.get("korean", "")
+        korean = raw_korean if isinstance(raw_korean, str) else ""
+        if not korean.strip():
+            # Nothing to pronounce (missing, null or non-string value).
+            item['audio_uri'] = None
+            continue
+        # Add dot
+        if not korean.endswith("."):
+            korean += "."
+        try:
+            wav, dur = tts.synthesize(korean, voice_style=voice_style, lang="ko")
+            # DEBUG: Save audio locally
+            wav_1d = wav.squeeze()
+            sf.write(f"log/debug_audio_{i}.wav", wav_1d, tts.sample_rate, format='WAV')
+            audio_data_uri = numpy_to_base64_audio(wav, tts.sample_rate)
+            item['audio_uri'] = audio_data_uri
+        except Exception as e:
+            print(f"TTS error for '{korean}': {e}")
+            item['audio_uri'] = None
+    # Escape "</" so a value containing "</script>" cannot end the script tag.
+    cards_json = json.dumps(vocab_list).replace("</", "<\\/")
+    iframe_html = f"""
+    <!DOCTYPE html>
+    <html>
+    <head>
+    <!-- Flaticon UIcons CDN -->
+    <link rel='stylesheet' href='https://cdn-uicons.flaticon.com/uicons-regular-rounded/css/uicons-regular-rounded.css'>
+    <style>
+        body {{
+            margin: 0;
+            padding: 0;
+            background: transparent;
+        }}
+        .flashcard-container {{
+            perspective: 1000px;
+            width: 100%;
+            max-width: 500px;
+            margin: 0 auto;
+            font-family: 'Inter', sans-serif;
+            padding-top: 20px;
+        }}
+        .flashcard {{
+            width: 100%;
+            height: 350px;
+            position: relative;
+            transition: transform 0.6s cubic-bezier(0.4, 0.2, 0.2, 1);
+            transform-style: preserve-3d;
+            cursor: pointer;
+        }}
+        .flashcard.is-flipped {{
+            transform: rotateY(180deg);
+        }}
+        .card-face {{
+            position: absolute;
+            width: 100%;
+            height: 100%;
+            backface-visibility: hidden;
+            display: flex;
+            flex-direction: column;
+            justify-content: center;
+            align-items: center;
+            border-radius: 20px;
+            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
+            padding: 30px;
+            box-sizing: border-box;
+            background: rgba(255, 255, 255, 0.8);
+            backdrop-filter: blur(10px);
+            border: 1px solid rgba(255,255,255,0.5);
+            text-align: center;
+        }}
+        .card-front {{
+            background: linear-gradient(135deg, #fdfbfb 0%, #ebedee 100%);
+        }}
+        .card-back {{
+            transform: rotateY(180deg);
+            background: linear-gradient(135deg, #f6d365 0%, #fda085 100%);
+            color: #333;
+        }}
+        .korean-text {{
+            font-size: 54px;
+            font-weight: 700;
+            color: #2c3e50;
+            margin-bottom: 20px;
+        }}
+        .english-text {{
+            font-size: 32px;
+            font-weight: 600;
+            margin-bottom: 5px;
+        }}
+        .translit-text {{
+            font-size: 18px;
+            font-style: italic;
+            color: #d35400;
+            margin-bottom: 15px;
+        }}
+        .explanation-text {{
+            font-size: 16px;
+            color: #555;
+            line-height: 1.5;
+        }}
+        .nav-buttons {{
+            display: flex;
+            justify-content: space-between;
+            margin-top: 30px;
+            width: 100%;
+            max-width: 500px;
+            margin-left: auto;
+            margin-right: auto;
+        }}
+        .nav-btn {{
+            padding: 12px 24px;
+            border: none;
+            border-radius: 12px;
+            background: #7c3aed;
+            color: white;
+            font-weight: 600;
+            cursor: pointer;
+            transition: all 0.2s;
+            box-shadow: 0 4px 12px rgba(124, 58, 237, 0.3);
+            flex: 1;
+            margin: 0 10px;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            gap: 8px;
+        }}
+        .nav-btn:hover {{
+            background: #6d28d9;
+            transform: translateY(-2px);
+        }}
+        .nav-btn:disabled {{
+            background: #ccc;
+            cursor: not-allowed;
+            transform: none;
+            box-shadow: none;
+        }}
+        .audio-btn {{
+            margin-top: 20px;
+            padding: 12px 24px;
+            border-radius: 50px;
+            border: none;
+            background: #2c3e50;
+            color: white;
+            cursor: pointer;
+            font-size: 16px;
+            font-weight: 600;
+            transition: all 0.2s;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            gap: 8px;
+        }}
+        .audio-btn:hover {{
+            background: #34495e;
+            transform: scale(1.05);
+        }}
+        .progress {{
+            text-align: center;
+            margin-top: 15px;
+            color: #666;
+            font-size: 14px;
+            font-weight: 600;
+        }}
+    </style>
+    </head>
+    <body>
+    <div id="flashcard-app">
+        <div class="flashcard-container">
+            <div class="flashcard" id="card" onclick="flipCard()">
+                <div class="card-face card-front">
+                    <div class="korean-text" id="front-text"><i class="fi fi-rr-spinner-third fa-spin"></i> Loading...</div>
+                    <button class="audio-btn" onclick="playAudio(event)" id="audio-btn" style="display:none;"><i class="fi fi-rr-play-circle"></i> Play Audio</button>
+                    <p style="margin-top:20px; color:#999; font-size:13px; display:flex; align-items:center; gap:5px;"><i class="fi fi-rr-rotate-right"></i> Click card to flip 🎯</p>
+                </div>
+                <div class="card-face card-back">
+                    <div class="english-text" id="back-en"></div>
+                    <div class="translit-text" id="back-translit"></div>
+                    <div class="explanation-text"><i class="fi fi-rr-lightbulb-on" style="color:#f1c40f;"></i> <span id="back-exp"></span></div>
+                </div>
+            </div>
+        </div>
+        <div class="nav-buttons">
+            <button class="nav-btn" id="prev-btn" onclick="prevCard()"><i class="fi fi-rr-angle-left"></i> Previous</button>
+            <button class="nav-btn" id="next-btn" onclick="nextCard()">Next <i class="fi fi-rr-angle-right"></i></button>
+        </div>
+        <div class="progress" id="progress-text"></div>
+    </div>
+    <script>
+        const cards = {cards_json};
+        let currentIndex = 0;
+        let audioPlayer = new Audio();
+        function updateCard() {{
+            if (!cards || cards.length === 0) {{
+                document.getElementById('front-text').innerHTML = "No vocabulary found 😥";
+                document.getElementById('prev-btn').disabled = true;
+                document.getElementById('next-btn').disabled = true;
+                return;
+            }}
+            const card = cards[currentIndex];
+            document.getElementById('front-text').innerText = card.korean || "No word";
+            document.getElementById('back-en').innerText = card.translation || card.english || "";
+            document.getElementById('back-translit').innerText = card.transliteration ? `[${{card.transliteration}}]` : "";
+            document.getElementById('back-exp').innerText = card.explanation || "";
+            document.getElementById('prev-btn').disabled = currentIndex === 0;
+            document.getElementById('next-btn').disabled = currentIndex === cards.length - 1;
+            document.getElementById('progress-text').innerHTML = `📚 Card ${{currentIndex + 1}} of ${{cards.length}}`;
+            const cardEl = document.getElementById('card');
+            cardEl.classList.remove('is-flipped');
+            if(card.audio_uri) {{
+                audioPlayer.src = card.audio_uri;
+                document.getElementById('audio-btn').style.display = 'flex';
+            }} else {{
+                document.getElementById('audio-btn').style.display = 'none';
+            }}
+        }}
+        function flipCard() {{
+            if (!cards || cards.length === 0) return;
+            document.getElementById('card').classList.toggle('is-flipped');
+        }}
+        function playAudio(e) {{
+            e.stopPropagation();
+            audioPlayer.play().catch(err => console.log("Audio play error:", err));
+        }}
+        function nextCard() {{
+            if (currentIndex < cards.length - 1) {{
+                currentIndex++;
+                updateCard();
+            }}
+        }}
+        function prevCard() {{
+            if (currentIndex > 0) {{
+                currentIndex--;
+                updateCard();
+            }}
+        }}
+        window.onload = function() {{
+            updateCard();
+        }};
+    </script>
+    </body>
+    </html>
+    """
+    # srcdoc is an HTML attribute, so the whole document must be escaped.
+    safe_srcdoc = html.escape(iframe_html)
+    # Return the iframe containing the whole SPA
+    return f'<iframe srcdoc="{safe_srcdoc}" style="width: 100%; height: 500px; border: none; overflow: hidden;"></iframe>'
+LANGUAGE_DATA = """Indo-European	English, French, Portuguese, German, Romanian, Swedish, Danish, Bulgarian, Russian, Czech, Greek, Ukrainian, Spanish, Dutch, Slovak, Croatian, Polish, Lithuanian, Norwegian Bokmål, Norwegian Nynorsk, Persian, Slovenian, Gujarati, Latvian, Italian, Occitan, Nepali, Marathi, Belarusian, Serbian, Luxembourgish, Venetian, Assamese, Welsh, Silesian, Asturian, Chhattisgarhi, Awadhi, Maithili, Bhojpuri, Sindhi, Irish, Faroese, Hindi, Punjabi, Bengali, Oriya, Tajik, Eastern Yiddish, Lombard, Ligurian, Sicilian, Friulian, Sardinian, Galician, Catalan, Icelandic, Tosk Albanian, Limburgish, Dari, Afrikaans, Macedonian, Sinhala, Urdu, Magahi, Bosnian, Armenian, Latgalian, Scottish Gaelic, Central Kurdish, Northern Kurdish, Southern Pashto, Sanskrit, Dhundari, Marwari, Ahirani, Bagheli, Bagri, Bundeli, Braj, Kumaoni, Kashmiri
+Sino-Tibetan	Chinese (Simplified), Chinese (Traditional), Cantonese, Burmese, Standard Tibetan, Meitei
+Afro-Asiatic	Arabic (Standard), Arabic (Najdi), Arabic (Levantine), Arabic (Egyptian), Arabic (Moroccan), Arabic (Mesopotamian), Arabic (Ta’izzi-Adeni), Arabic (Tunisian), Arabic (Gulf), Arabic (Algerian), Arabic (Sudanese), Arabic (Libyan), Hebrew, Maltese, Amharic, Tigrinya, Kabyle, Somali, West Central Oromo, Hausa
+Austronesian	Indonesian, Malay, Tagalog, Cebuano, Javanese, Sundanese, Minangkabau, Balinese, Banjar, Pangasinan, Iloko, Waray (Philippines), Plateau Malagasy, Malagasy, Buginese, Maori, Samoan, Hawaiian, Fijian
+Dravidian	Tamil, Telugu, Kannada, Malayalam
+Turkic	Turkish, North Azerbaijani, Northern Uzbek, Kazakh, Bashkir, Tatar, Crimean Tatar, Kyrgyz, Turkmen, Uyghur
+Tai-Kadai	Thai, Lao, Shan
+Uralic	Finnish, Estonian, Hungarian, Meadow Mari
+Austroasiatic	Vietnamese, Khmer
+Niger–Congo	Yoruba, Ewe, Kinyarwanda, Lingala, Northern Sotho, Nyanja, Shona, Southern Sotho, Tswana, Xhosa, Zulu, Luganda, Swati, Tsonga, Tumbuka, Venda, Chokwe, Luba-Kasai, Rundi, Umbundu, Kikuyu, Kongo, Nigerian Fulfulde, Wolof, Fon, Kabiyè, Mossi, Akan, Twi, Bambara, Igbo
+Other	Japanese, Korean, Georgian, Basque, Haitian, Papiamento, Kabuverdianu, Tok Pisin, Swahili, Central Aymara, Tulu, Nagamese, Nigerian Pidgin, Mauritian Creole, Sango, Ayacucho Quechua, Halh Mongolian, Southwestern Dinka, Nuer, Guarani"""
+# Flatten the tab-separated "family<TAB>lang, lang, ..." table into
+# "Family - Language" dropdown labels, preserving the table's order.
+LANGUAGE_CHOICES = [
+    f"{family} - {lang}"
+    for family, langs in (row.split('\t') for row in LANGUAGE_DATA.strip().split('\n'))
+    for lang in langs.split(', ')
+]
+def create_demo():
+    """Build the LocalDuo Gradio ``Blocks`` interface and return it.
+
+    The left column holds the PDF upload, the three customization dropdowns
+    and the submit button; the right column holds the HTML output. Clicking
+    the button calls ``process_pdf`` with the four inputs. The interface is
+    returned without being launched.
+    """
+    default_language = "Indo-European - English"
+    format_options = ["dashed syllable", "regular word with space"]
+    with gr.Blocks(title="LocalDuo") as demo:
+        gr.Markdown("# 🇰🇷✨ LocalDuo - Learn Korean from PDFs")
+        gr.Markdown("Upload a Korean book 📖 or document PDF 📄. The app uses **vLLM** 🧠 with Qwen3.5-2B to extract vocabulary from text and images, and Supertonic 🗣️ to generate pronunciation audio.")
+        with gr.Row():
+            # Left: inputs and settings.
+            with gr.Column(scale=1):
+                pdf_input = gr.File(label="Upload Book PDF 📚", file_types=[".pdf"])
+                gr.Markdown("### ⚙️ Customization Settings")
+                translit_lang = gr.Dropdown(label="Word Transliteration Language", choices=LANGUAGE_CHOICES, value=default_language)
+                translit_format = gr.Dropdown(
+                    label="Transliteration Format",
+                    choices=format_options,
+                    value="dashed syllable",
+                )
+                target_lang = gr.Dropdown(label="Target Language (Full App)", choices=LANGUAGE_CHOICES, value=default_language)
+                submit_btn = gr.Button("✨ Generate Flashcards ✨", variant="primary")
+            # Right: rendered flashcards.
+            with gr.Column(scale=2):
+                output_html = gr.HTML(label="Flashcards will appear here")
+        click_inputs = [pdf_input, translit_lang, translit_format, target_lang]
+        submit_btn.click(fn=process_pdf, inputs=click_inputs, outputs=output_html)
+    return demo
+# Script entry point: load the vLLM model and the TTS engine into the
+# module-level globals (llm, sampling_params, tts, voice_style) that
+# extract_vocabulary() and process_pdf() read, then launch the Gradio app.
+# These globals stay None when the module is imported rather than run.
+if __name__ == "__main__":
+    print("Loading Qwen3.5-2B model via vLLM...")
+    llm = LLM(
+        model="Qwen/Qwen3.5-2B",
+        # model="Qwen/Qwen3.5-9B",
+        max_model_len=65536,  # Reduced from 262144 to fit on single GPU
+        tensor_parallel_size=1, # Kept at 1 since CUDA_VISIBLE_DEVICES=1
+        gpu_memory_utilization=0.5,
+        enable_prefix_caching=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 10} # Added image limits
+    )
+    # Sampling settings used for every vocabulary-extraction request.
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        top_p=0.95,
+        top_k=20,
+        min_p=0.0,
+        presence_penalty=0.0,
+        repetition_penalty=1.0,
+        max_tokens=2048,
+    )
+    print("Loading Supertonic TTS...")
+    tts = TTS(model="supertonic-3")
+    # Prefer the "F1" voice style; if it cannot be loaded, fall back to the
+    # first style name the engine reports.
+    try:
+        voice_style = tts.get_voice_style("F1")
+    except Exception:
+        voice_style = tts.get_voice_style(tts.voice_style_names[0])
+    demo = create_demo()
+    # NOTE(review): Hugging Face Spaces presumably expect the app on port
+    # 7860 — confirm that 7861 is intended for this deployment.
+    demo.launch(server_name="0.0.0.0", server_port=7861)