david167 committed on
Commit
f093b76
·
1 Parent(s): a7cf970

Add JSON imports for structured response functionality

Browse files
Files changed (1) hide show
  1. gradio_app.py +0 -275
gradio_app.py DELETED
@@ -1,275 +0,0 @@
1
- import os
2
- import logging
3
- import threading
4
- from typing import List, Tuple
5
-
6
- import torch
7
- from transformers import AutoTokenizer, AutoModelForCausalLM
8
- import gradio as gr
9
-
10
- # Configure logging
11
- logging.basicConfig(level=logging.INFO)
12
- logger = logging.getLogger(__name__)
13
-
14
- # Global variables for model
15
- model = None
16
- tokenizer = None
17
- device = None
18
- model_loaded = False
19
-
20
- def load_model():
21
- """Load the Llama model and tokenizer"""
22
- global model, tokenizer, device, model_loaded
23
-
24
- try:
25
- logger.info("Starting model loading...")
26
-
27
- # Check if CUDA is available and force to cuda:0
28
- if torch.cuda.is_available():
29
- torch.cuda.set_device(0)
30
- device = "cuda:0"
31
- else:
32
- device = "cpu"
33
-
34
- logger.info(f"Using device: {device}")
35
-
36
- if device == "cuda:0":
37
- logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
38
- logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
39
-
40
- # Get HF token from environment
41
- hf_token = os.getenv("HF_TOKEN")
42
-
43
- logger.info("Loading Llama-3.1-8B-Instruct model...")
44
- model_name = "meta-llama/Llama-3.1-8B-Instruct"
45
-
46
- # Load tokenizer
47
- tokenizer = AutoTokenizer.from_pretrained(
48
- model_name,
49
- use_fast=True,
50
- trust_remote_code=True,
51
- token=hf_token
52
- )
53
-
54
- # Load model
55
- model = AutoModelForCausalLM.from_pretrained(
56
- model_name,
57
- torch_dtype=torch.float16 if device == "cuda:0" else torch.float32,
58
- device_map={"": 0}, # Force all parameters to GPU 0
59
- trust_remote_code=True,
60
- low_cpu_mem_usage=True,
61
- use_safetensors=True,
62
- token=hf_token
63
- )
64
-
65
- # Ensure model is on the correct device
66
- if device == "cuda:0":
67
- model = model.to(device)
68
-
69
- model_loaded = True
70
- logger.info("Model loaded successfully!")
71
-
72
- except Exception as e:
73
- logger.error(f"Error loading model: {str(e)}")
74
- model_loaded = False
75
-
76
- def chat_response(message: str, history: List[List[str]], temperature: float) -> Tuple[List[List[str]], str]:
77
- """Generate a response to the user's message"""
78
- global model, tokenizer, device, model_loaded
79
-
80
- if not model_loaded:
81
- history.append([message, "🔄 Model is still loading, please wait..."])
82
- return history, ""
83
-
84
- if not message.strip():
85
- return history, ""
86
-
87
- try:
88
- # Create Llama chat prompt
89
- conversation = ""
90
- for user_msg, assistant_msg in history:
91
- if user_msg and assistant_msg:
92
- conversation += f"<|start_header_id|>user<|end_header_id|>\n{user_msg}<|eot_id|>"
93
- conversation += f"<|start_header_id|>assistant<|end_header_id|>\n{assistant_msg}<|eot_id|>"
94
-
95
- # Add current message
96
- prompt = f"<|begin_of_text|>{conversation}<|start_header_id|>user<|end_header_id|>\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
97
-
98
- # Tokenize input
99
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
100
-
101
- # Move to correct device
102
- if device == "cuda:0":
103
- inputs = {k: v.to(device) for k, v in inputs.items()}
104
-
105
- # Generate response
106
- with torch.no_grad():
107
- outputs = model.generate(
108
- **inputs,
109
- max_new_tokens=2048,
110
- temperature=temperature,
111
- top_p=0.95,
112
- do_sample=True,
113
- pad_token_id=tokenizer.eos_token_id,
114
- eos_token_id=tokenizer.eos_token_id
115
- )
116
-
117
- # Decode response
118
- generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) # TEMPORARY: Show complete raw output to debug clipping
119
- response = f"""=== RAW MODEL OUTPUT ===
120
- {generated_text}
121
- === END RAW OUTPUT ===
122
-
123
- === PROMPT USED ===
124
- {prompt}
125
- === END PROMPT ==="""
126
-
127
- # Add to history
128
- # Add to history
129
- history.append([message, response])
130
-
131
- except Exception as e:
132
- logger.error(f"Error generating response: {str(e)}")
133
- history.append([message, f"❌ Error: {str(e)}"])
134
-
135
- return history, ""
136
-
137
- def clear_history():
138
- """Clear the chat history"""
139
- return []
140
-
141
- # Load model in background thread
142
- def load_model_background():
143
- load_model()
144
-
145
- model_thread = threading.Thread(target=load_model_background, daemon=True)
146
- model_thread.start()
147
-
148
- # Custom CSS for ChatGPT-like appearance
149
- css = """
150
- .gradio-container {
151
- max-width: 100%; width: 100% !important;
152
- margin: 0; padding: 20px !important;
153
- }
154
- #chatbot {
155
- height: 70vh; min-height: 600px !important;
156
- overflow-y: auto !important;
157
- }
158
- .message {
159
- padding: 12px 16px !important;
160
- margin: 5px 0 !important;
161
- border-radius: 12px; max-width: 85%; word-wrap: break-word !important;
162
- }
163
- .user {
164
- background-color: #dcf8c6 !important;
165
- margin-left: auto; margin-right: 0 !important;
166
- }
167
- .bot {
168
- background-color: #f1f1f1 !important;
169
- margin-left: 0; margin-right: auto !important;
170
- }/* Responsive design for larger screens */
171
- @media (min-width: 1400px) {
172
- .gradio-container {
173
- padding: 40px !important;
174
- }
175
- #chatbot {
176
- height: 75vh !important;
177
- }
178
- }
179
- @media (min-width: 1800px) {
180
- .gradio-container {
181
- padding: 60px !important;
182
- }
183
- #chatbot {
184
- height: 80vh !important;
185
- }
186
- }}
187
- """
188
-
189
- # Create Gradio interface
190
- with gr.Blocks(
191
- css=css,
192
- title="Llama Chat",
193
- theme=gr.themes.Soft()
194
- ) as demo:
195
-
196
- # Header
197
- gr.Markdown(
198
- """
199
- # 🦙 Llama Chat
200
- ### Powered by Llama-3.1-8B-Instruct
201
-
202
- A clean, ChatGPT-style interface for conversing with the Llama model.
203
- """
204
- )
205
-
206
- # Chat interface
207
- with gr.Row():
208
- with gr.Column(scale=4):
209
- chatbot = gr.Chatbot(
210
- label="Chat",
211
- show_label=False,
212
- height=600,
213
- show_copy_button=True
214
- )
215
-
216
- with gr.Row():
217
- msg = gr.Textbox(
218
- placeholder="Type your message here...",
219
- show_label=False,
220
- scale=4,
221
- lines=1,
222
- max_lines=5
223
- )
224
- send_btn = gr.Button("Send", variant="primary", scale=1)
225
-
226
- with gr.Column(scale=1, min_width=250):
227
- gr.Markdown("### ⚙️ Settings")
228
-
229
- temperature = gr.Slider(
230
- minimum=0.1,
231
- maximum=2.0,
232
- value=0.8,
233
- step=0.1,
234
- label="Temperature",
235
- info="Controls creativity (0.1=focused, 2.0=creative)"
236
- )
237
-
238
- clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
239
-
240
- gr.Markdown(
241
- """
242
- ### 💡 Tips
243
- - Use lower temperature (0.1-0.5) for factual responses
244
- - Use higher temperature (1.0-2.0) for creative tasks
245
- - Press Enter to send messages
246
- - The model maintains conversation context
247
- """
248
- )
249
-
250
- # Event handlers
251
- def respond(message, history, temp):
252
- return chat_response(message, history, temp)
253
-
254
- # Connect events
255
- msg.submit(respond, [msg, chatbot, temperature], [chatbot, msg])
256
- send_btn.click(respond, [msg, chatbot, temperature], [chatbot, msg])
257
- clear_btn.click(lambda: (clear_history(), ""), outputs=[chatbot, msg])
258
-
259
- # Footer
260
- gr.Markdown(
261
- """
262
- ---
263
- <div style="text-align: center; color: #666; font-size: 0.9em;">
264
- 🚀 Built with Gradio • 🦙 Powered by Llama-3.1-8B-Instruct
265
- </div>
266
- """
267
- )
268
-
269
- if __name__ == "__main__":
270
- demo.launch(
271
- server_name="0.0.0.0",
272
- server_port=7860,
273
- share=False,
274
- show_error=True
275
- )