Studymaker2

Sleeping

App Files Files Community

g0th committed on May 28, 2025

Commit

d405ed8

verified ·

1 Parent(s): 1dbc013

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -48

app.py CHANGED Viewed

@@ -1,93 +1,106 @@
-import gradio as gr
 import os
 import json
-from ppt_parser import transfer_to_structure
 from PIL import Image
 import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText
-# ✅ Hugging Face Token for gated model access
 # Access token read from the HF_TOKEN environment variable (None when it is unset).
 hf_token = os.getenv("HF_TOKEN")
-# ✅ Load Llama-4-Scout model and processor
-processor = AutoProcessor.from_pretrained("meta-llama/Llama-4-Scout-17B-16E-Instruct", token=hf_token)
-model = AutoModelForImageTextToText.from_pretrained(
-    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    torch_dtype=torch.float16,
     device_map="auto",
-    token=hf_token
 )
-# ✅ Extracted data storage
 # Module-level state: both names are assigned in handle_pptx_upload and read in ask_llama.
 extracted_text = ""
-slide_images = []
 def extract_text_from_pptx_json(parsed_json: dict) -> str:
     # Concatenate the "text" of every "paragraph_*" entry, one per line,
     # across all slides of the parsed deck, and return it stripped.
     text = ""
     for slide in parsed_json.values():
         for shape in slide.values():
             # Group shapes: look one level down, inside "group_content", for text shapes.
-            if shape.get('type') == 'group':
-                for group_shape in shape.get('group_content', {}).values():
-                    if group_shape.get('type') == 'text':
                         for para_key, para in group_shape.items():
                             if para_key.startswith("paragraph_"):
                                 text += para.get("text", "") + "\n"
             # Plain text shapes: the paragraph entries sit directly on the shape dict.
-            elif shape.get('type') == 'text':
                 for para_key, para in shape.items():
                     if para_key.startswith("paragraph_"):
                         text += para.get("text", "") + "\n"
     # strip() removes the trailing newline left after the last paragraph.
     return text.strip()
-# ✅ Handle uploaded .pptx
 def handle_pptx_upload(pptx_file):
     # Parse the uploaded deck and keep its text and image paths in module
     # globals so that ask_llama can use them afterwards.
-    global extracted_text, slide_images
     # Reads the file path from .name — presumably a Gradio temp-file object; confirm against the UI wiring.
     tmp_path = pptx_file.name
     # The result is unpacked as (JSON string, image paths); "images" is
     # presumably the output directory — confirm against ppt_parser.
     parsed_json_str, image_paths = transfer_to_structure(tmp_path, "images")
     parsed_json = json.loads(parsed_json_str)
     extracted_text = extract_text_from_pptx_json(parsed_json)
-    slide_images = image_paths
     # Fall back to a notice when no text was extracted.
     return extracted_text or "No readable text found in slides."
-# ✅ Ask a question using Llama 4 Scout
 def ask_llama(question):
     # Answer a question about the uploaded deck: builds one user message from
     # the first slide image plus the extracted text and the question, generates
     # up to 512 new tokens and returns the decoded output.
-    global extracted_text, slide_images
     # Nothing to ground an answer in until a deck has been processed.
-    if not extracted_text and not slide_images:
-        return "Please upload a PPTX file first."
-    inputs = {
-        "role": "user",
-        "content": []
-    }
-    # Add first image only (multimodal models may limit batch input size)
-    if slide_images:
-        image = Image.open(slide_images[0])
-        inputs["content"].append({"type": "image", "image": image})
-    # Add contextual text + question
-    context = f"{extracted_text}\n\nQuestion: {question}"
-    inputs["content"].append({"type": "text", "text": context})
     # NOTE(review): the chat-message dict is passed straight to the processor as
     # text=; whether the processor accepts this form is not visible here — confirm.
-    outputs = processor(text=[inputs], return_tensors="pt").to(model.device)
     # Inference only, so gradient tracking is switched off.
-    with torch.no_grad():
-        generated_ids = model.generate(**outputs, max_new_tokens=512)
     # NOTE(review): the whole generated sequence is decoded, which presumably
     # includes the prompt tokens as well — confirm.
-    result = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return result
 # ✅ Gradio UI
 # Components are declared in this order: title, file picker, extract button,
 # extracted-text box, question box, ask button, answer box.
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧠 Llama 4 Scout: PPTX-Based Multimodal Study Assistant")
     pptx_input = gr.File(label="📂 Upload PPTX File", file_types=[".pptx"])
-    extract_btn = gr.Button("📜 Extract Text + Slides")
-    extracted_output = gr.Textbox(label="📄 Extracted Text", lines=10, interactive=False)
     # Extract button: runs handle_pptx_upload on the chosen file and shows the result in the text box.
     extract_btn.click(handle_pptx_upload, inputs=[pptx_input], outputs=[extracted_output])
     question = gr.Textbox(label="❓ Ask a Question")
     ask_btn = gr.Button("💬 Ask Llama 4 Scout")
-    ai_answer = gr.Textbox(label="🤖 Llama Answer", lines=4)
     # Ask button: runs ask_llama on the question and shows the result in the answer box.
     ask_btn.click(ask_llama, inputs=[question], outputs=[ai_answer])

 import os
 import json
+import requests
 from PIL import Image
 import torch
+import gradio as gr
+from ppt_parser import transfer_to_structure
+from transformers import AutoProcessor, Llama4ForConditionalGeneration
+# ✅ Hugging Face token
 # Access token read from the HF_TOKEN environment variable (None when it is unset).
 hf_token = os.getenv("HF_TOKEN")
 # Single checkpoint id shared by the processor and the model load below.
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+# ✅ Load model & processor
+processor = AutoProcessor.from_pretrained(model_id, token=hf_token)
+model = Llama4ForConditionalGeneration.from_pretrained(
+    model_id,
+    token=hf_token,
+    attn_implementation="flex_attention",
     device_map="auto",
+    torch_dtype=torch.bfloat16,
 )
+# ✅ Global storage
 # Module-level state: both names are assigned in handle_pptx_upload and read in ask_llama.
 extracted_text = ""
+image_paths = []
 def _paragraph_texts(shape: dict) -> list:
     """Return the text of every ``paragraph_*`` entry of one text shape, in order."""
     return [
         para.get("text") or ""
         for key, para in shape.items()
         if key.startswith("paragraph_") and isinstance(para, dict)
     ]

 def extract_text_from_pptx_json(parsed_json: dict) -> str:
     """Collect all paragraph text from a parsed PPTX structure.

     The structure is read as ``{slide_key: {shape_key: shape}}``. A shape whose
     ``"type"`` is ``"text"`` contributes the ``"text"`` of each of its
     ``"paragraph_*"`` entries; a shape whose ``"type"`` is ``"group"``
     contributes the same for every text shape found in its
     ``"group_content"`` mapping (one level deep). Other shapes are ignored.

     Args:
         parsed_json: Parsed deck structure as described above.

     Returns:
         The paragraph texts joined with newlines, with leading and trailing
         whitespace stripped. Empty string when nothing was found.
     """
     lines = []
     for slide in parsed_json.values():
         if not isinstance(slide, dict):
             continue
         for shape in slide.values():
             # Slides may carry non-shape entries (plain strings, numbers);
             # skip them instead of failing on ``.get``.
             if not isinstance(shape, dict):
                 continue
             kind = shape.get("type")
             if kind == "text":
                 lines.extend(_paragraph_texts(shape))
             elif kind == "group":
                 members = shape.get("group_content") or {}
                 for member in members.values():
                     if isinstance(member, dict) and member.get("type") == "text":
                         lines.extend(_paragraph_texts(member))
     # Joining once avoids repeated string concatenation inside nested loops.
     return "\n".join(lines).strip()
+# ✅ Handle uploaded PPTX
 def handle_pptx_upload(pptx_file):
     """Parse an uploaded PPTX and cache its text and image paths for later questions.

     Stores the extracted text in the module-level ``extracted_text`` and the
     image paths returned by the parser in the module-level ``image_paths``.

     Args:
         pptx_file: Value delivered by the file input: an object exposing the
             file path as ``.name``, a plain path, or ``None`` when nothing
             has been uploaded.

     Returns:
         The extracted slide text, or a short notice when no file was given or
         the deck contains no readable text.
     """
     global extracted_text, image_paths
     if pptx_file is None:
         # Clicking "Extract" with no file selected would otherwise raise
         # AttributeError on ``.name``.
         return "Please upload a PPTX file first."
     # Accept both file-like wrappers (``.name``) and plain path values.
     tmp_path = getattr(pptx_file, "name", pptx_file)
     # NOTE(review): "images" is presumably the directory the parser writes
     # slide images to; confirm against ppt_parser.
     parsed_json_str, image_paths = transfer_to_structure(tmp_path, "images")
     extracted_text = extract_text_from_pptx_json(json.loads(parsed_json_str))
     return extracted_text or "No readable text found in slides."
+# ✅ Multimodal Q&A using Scout
 def ask_llama(question):
     """Answer a question about the most recently extracted PPTX.

     Builds a single user chat message holding up to two slide images followed
     by the extracted slide text and the question, runs generation with at most
     256 new tokens, and returns only the newly generated text. Reads the
     module-level ``extracted_text`` and ``image_paths``.

     Args:
         question: Question text entered in the UI.

     Returns:
         The model's answer with special tokens removed, or a short notice when
         no deck has been extracted yet or the question is blank.
     """
     if not extracted_text and not image_paths:
         return "Please upload and extract a PPTX first."
     if not question or not question.strip():
         # Skip a full generation pass when there is nothing to answer.
         return "Please enter a question."
     content = []
     # Add up to 2 images to prevent OOM.
     for path in (image_paths or [])[:2]:
         # copy() pulls the pixel data into memory so the file handle is
         # released here instead of staying open after every request.
         with Image.open(path) as slide_image:
             content.append({"type": "image", "image": slide_image.copy()})
     content.append({
         "type": "text",
         "text": f"{extracted_text}\n\nQuestion: {question}",
     })
     messages = [{"role": "user", "content": content}]
     inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
         return_tensors="pt",
     ).to(model.device)
     outputs = model.generate(**inputs, max_new_tokens=256)
     # Slice off the prompt so only the newly generated tokens are decoded.
     prompt_length = inputs["input_ids"].shape[-1]
     # skip_special_tokens keeps end-of-turn markers out of the answer box.
     response = processor.batch_decode(
         outputs[:, prompt_length:],
         skip_special_tokens=True,
     )[0]
     return response.strip()
 # ✅ Gradio UI
 # Components are declared in this order: title, file picker, extract button,
 # slide-text box, question box, ask button, answer box.
 with gr.Blocks() as demo:
+    gr.Markdown("## 🧠 Multimodal Llama 4 Scout Study Assistant")
     pptx_input = gr.File(label="📂 Upload PPTX File", file_types=[".pptx"])
+    extract_btn = gr.Button("📜 Extract Text + Images")
+    extracted_output = gr.Textbox(label="📄 Slide Text", lines=10, interactive=False)
     # Extract button: runs handle_pptx_upload on the chosen file and shows the result in the text box.
     extract_btn.click(handle_pptx_upload, inputs=[pptx_input], outputs=[extracted_output])
     question = gr.Textbox(label="❓ Ask a Question")
     ask_btn = gr.Button("💬 Ask Llama 4 Scout")
+    ai_answer = gr.Textbox(label="🤖 Answer", lines=6)
     # Ask button: runs ask_llama on the question and shows the result in the answer box.
     ask_btn.click(ask_llama, inputs=[question], outputs=[ai_answer])