david167 commited on
Commit
f364fe3
·
1 Parent(s): f093b76

Add complete JSON functionality to Gradio interface

Browse files
Files changed (1) hide show
  1. gradio_app.py +301 -0
gradio_app.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import threading
4
+ import json
5
+ import re
6
+
7
+ import torch
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM
9
+ import gradio as gr
10
+
11
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global model state, populated by load_model() on a background thread.
# The Gradio callbacks must check model_loaded before touching model/tokenizer.
model = None          # transformers AutoModelForCausalLM once loaded
tokenizer = None      # matching AutoTokenizer once loaded
device = None         # "cuda:0" or "cpu", chosen when the model loads
model_loaded = False  # set True only after a fully successful load
20
+
21
def load_model():
    """Load the Llama tokenizer and model into the module globals.

    Runs on a background thread at import time so the UI starts
    immediately. Sets ``model_loaded`` to True only after both the
    tokenizer and the model load successfully; any failure is logged and
    leaves ``model_loaded`` False so chat_with_model() can answer with a
    "not loaded yet" message instead of crashing.
    """
    global model, tokenizer, device, model_loaded

    try:
        logger.info("Starting model loading...")

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            torch.cuda.set_device(0)
            device = "cuda:0"
        else:
            device = "cpu"
        logger.info(f"Using device: {device}")

        if use_cuda:
            logger.info(f"GPU: {torch.cuda.get_device_name()}")
            logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

        # Gated repo: an HF access token must be supplied via the environment.
        hf_token = os.getenv("HF_TOKEN")

        logger.info("Loading Llama-3.1-8B-Instruct model...")
        base_model_name = "meta-llama/Llama-3.1-8B-Instruct"

        tokenizer = AutoTokenizer.from_pretrained(
            base_model_name,
            use_fast=True,
            trust_remote_code=True,
            token=hf_token,
        )

        # BUG FIX: the original always passed device_map={"": 0}, which asks
        # accelerate to place the entire model on GPU 0 even on CPU-only
        # hosts (where no GPU 0 exists), and then redundantly called
        # model.to(device) although device_map had already placed the
        # weights. Pin to GPU 0 only when CUDA is actually available; on
        # CPU, let from_pretrained load in float32 without a device map.
        model_kwargs = dict(
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            use_safetensors=True,
            token=hf_token,
        )
        if use_cuda:
            model_kwargs["device_map"] = {"": 0}

        model = AutoModelForCausalLM.from_pretrained(base_model_name, **model_kwargs)

        model_loaded = True
        logger.info("Model loaded successfully!")

    except Exception as e:
        # Deliberate best-effort: the UI keeps running and reports the
        # unloaded state instead of the process dying at import time.
        logger.error(f"Error loading model: {str(e)}")
        model_loaded = False
70
+
71
# Start model loading in a separate thread so the Gradio UI comes up
# immediately; chat_with_model() checks model_loaded before generating.
model_thread = threading.Thread(target=load_model)
model_thread.start()
74
+
75
def create_json_prompt(message, template_type):
    """Build a Llama-3 chat prompt that asks the model to answer in JSON.

    ``template_type`` selects one of the predefined instruction/schema
    pairs ("general" or "questions"); any unrecognized value falls back
    to the "general" template.
    """
    general_instruction = "Please respond in valid JSON format with the following structure:"
    general_schema = """{
"response": "your main response here",
"type": "answer|question|explanation|analysis",
"confidence": 0.95,
"metadata": {
"topic": "detected topic",
"complexity": "simple|moderate|complex"
}
}"""

    questions_instruction = "Generate 5 thoughtful questions based on the following statement. Respond in JSON format:"
    questions_schema = """{
"questions": [
"Question 1 here?",
"Question 2 here?",
"Question 3 here?",
"Question 4 here?",
"Question 5 here?"
],
"statement": "original statement",
"difficulty": "mixed",
"total_questions": 5,
"metadata": {
"topic": "detected topic",
"question_types": ["factual", "analytical", "creative"]
}
}"""

    json_templates = {
        "general": {"instruction": general_instruction, "schema": general_schema},
        "questions": {"instruction": questions_instruction, "schema": questions_schema},
    }
    chosen = json_templates.get(template_type, json_templates["general"])

    # Llama-3 instruct chat format: one user turn, then an open assistant
    # header for the model to complete.
    return f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{message}

{chosen["instruction"]}

{chosen["schema"]}

Ensure the response is valid JSON that can be parsed. Do not include any text outside the JSON structure.

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
127
+
128
def prettify_json_response(response_text):
    """Extract the first ``{...}`` span from *response_text* and pretty-print it.

    Returns the re-serialized JSON (2-space indent, non-ASCII preserved)
    when a parseable object is found; otherwise returns the text unchanged.
    """
    match = re.search(r'\{.*\}', response_text, re.DOTALL)
    if match is None:
        # No brace-delimited span at all — nothing to prettify.
        return response_text
    try:
        parsed = json.loads(match.group())
    except json.JSONDecodeError:
        # The span wasn't valid JSON; hand back the raw text untouched.
        return response_text
    return json.dumps(parsed, indent=2, ensure_ascii=False)
142
+
143
def chat_with_model(message, history, temperature, json_mode=False, json_template="general"):
    """Run one chat turn against the loaded model.

    Appends the user turn and the assistant reply to *history* (a Gradio
    "messages"-style list of {"role", "content"} dicts, mutated in place)
    and returns ``(history, "")`` — the empty string clears the textbox.
    Errors are reported as an assistant message rather than raised.
    """
    # Ignore empty/whitespace-only submissions.
    if not message.strip():
        return history, ""

    # The model loads on a background thread; answer gracefully until ready.
    if not model_loaded:
        response = "Model not loaded yet. Please wait..."
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": response})
        return history, ""

    try:
        # Build a Llama-3 chat prompt; JSON mode adds schema instructions.
        if json_mode:
            prompt = create_json_prompt(message, json_template)
        else:
            prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{message}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)

        # Move input tensors to wherever the model's parameters actually live.
        if device == "cuda:0":
            model_device = next(model.parameters()).device
            inputs = {k: v.to(model_device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=4096,
                temperature=temperature,
                top_p=0.95,
                do_sample=True,
                num_beams=1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                early_stopping=False,
                repetition_penalty=1.1
            )

        # BUG FIX: the original decoded the full sequence with
        # skip_special_tokens=True and then tried to split on
        # "<|start_header_id|>assistant<|end_header_id|>" — but those
        # markers ARE special tokens, so the decode never contains them and
        # the fallback `generated_text[len(prompt):]` mis-sliced (the prompt
        # length counts special-token text the decode no longer has).
        # Decode only the newly generated ids instead.
        input_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][input_length:], skip_special_tokens=True
        ).strip()

        if json_mode and response:
            response = prettify_json_response(response)

        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": response})

    except Exception as e:
        # Surface the failure in the chat window instead of crashing the UI.
        logger.error(f"Error in chat: {str(e)}")
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": f"Error: {str(e)}"})

    return history, ""
205
+
206
def clear_chat():
    """Reset the conversation: empty history plus a cleared textbox."""
    empty_history, empty_textbox = [], ""
    return empty_history, empty_textbox
208
+
209
# Custom CSS injected into gr.Blocks: make the app span the full browser
# width and keep the chat area (elem_id="chatbot") tall and scrollable.
css = """
.gradio-container {
    max-width: 100% !important;
    width: 100% !important;
    margin: 0 !important;
    padding: 20px !important;
}
#chatbot {
    height: 70vh !important;
    min-height: 600px !important;
    overflow-y: auto !important;
}
"""
222
+
223
# Build the Gradio UI. Component creation order matters: event wiring at
# the bottom references the components defined above it.
with gr.Blocks(css=css, title="Llama Chat", theme=gr.themes.Soft()) as demo:
    # Header / usage notes.
    gr.Markdown(
        """
        # 🦙 Llama Chat
        ### Raw interface for Llama-3.1-8B-Instruct with JSON Mode

        **JSON Response Mode**: Enable for structured outputs!
        - 🎯 **General**: Basic structured responses
        - ❓ **Questions**: Generate question sets from content
        """
    )

    # Conversation display; type="messages" matches the
    # {"role", "content"} dicts that chat_with_model() appends.
    chatbot = gr.Chatbot(
        elem_id="chatbot",
        label="Chat",
        show_label=False,
        avatar_images=(None, None),
        show_share_button=False,
        type="messages",
        height=600,
        render_markdown=True,
        show_copy_button=True
    )

    # Input row: textbox plus Send/Clear buttons.
    with gr.Row():
        with gr.Column(scale=4):
            msg = gr.Textbox(
                placeholder="Type your message here...",
                show_label=False,
                container=False
            )
        with gr.Column(scale=1):
            submit_btn = gr.Button("Send", variant="primary")
        with gr.Column(scale=1):
            clear_btn = gr.Button("Clear", variant="secondary")

    # Sampling temperature passed straight to model.generate().
    with gr.Row():
        temperature = gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.8,
            step=0.1,
            label="Temperature"
        )

    # JSON-mode controls; the template dropdown starts hidden and is
    # revealed by the json_mode.change handler below.
    with gr.Row():
        with gr.Column(scale=2):
            json_mode = gr.Checkbox(
                label="JSON Response Mode",
                value=False,
                info="Get structured JSON responses"
            )
        with gr.Column(scale=3):
            json_template = gr.Dropdown(
                choices=["general", "questions"],
                value="general",
                label="JSON Template",
                visible=False
            )

    def respond(message, history, temp, json_enabled, json_type):
        # Thin wrapper mapping Gradio's positional inputs onto chat_with_model.
        return chat_with_model(message, history, temp, json_enabled, json_type)

    def toggle_json_template(json_enabled):
        # Show the template dropdown only while JSON mode is checked.
        return gr.update(visible=json_enabled)

    json_mode.change(toggle_json_template, inputs=[json_mode], outputs=[json_template])

    # Enter key and Send button share the same handler; the second output
    # (empty string from chat_with_model) clears the textbox.
    msg.submit(respond, [msg, chatbot, temperature, json_mode, json_template], [chatbot, msg])
    submit_btn.click(respond, [msg, chatbot, temperature, json_mode, json_template], [chatbot, msg])
    clear_btn.click(clear_chat, outputs=[chatbot, msg])
294
+
295
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the standard HF Spaces port);
    # show_error surfaces handler tracebacks in the browser.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )