Simplify API - remove all templates, just prompt-in response-out
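In practice the simplified surface means a caller sends one prompt string and reads one response string back. Below is a minimal client-side sketch using gradio_client; the Space id and the "/respond" endpoint name are placeholders rather than anything defined in this commit, so check client.view_api() for the real endpoint names and signatures:

# Hypothetical client-side sketch (not part of the commit).
# "your-org/llama-chat" and api_name="/respond" are assumptions.
from gradio_client import Client

client = Client("your-org/llama-chat")   # placeholder Space id
client.view_api()                        # prints the exposed endpoints and their signatures

history, _cleared_box = client.predict(
    "Summarize the benefits of unit testing in three bullet points",  # message
    [],                                                               # chat history (messages format)
    0.7,                                                              # temperature
    api_name="/respond",                                              # assumed endpoint name
)
print(history[-1]["content"])            # last assistant turn returned by the app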
gradio_app.py  CHANGED  (+88 -416)
@@ -1,12 +1,5 @@
 import os
 import logging
-import time
-import asyncio
-from typing import List, Optional, Dict, Any
-import threading
-import json
-import re
-
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
@@ -15,12 +8,6 @@ import gradio as gr
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Global variables for model and tokenizer
-model = None
-tokenizer = None
-device = None
-model_loaded = False
-
 class ModelManager:
     def __init__(self):
         self.model = None
@@ -34,7 +21,7 @@ class ModelManager:
         try:
             logger.info("Starting model loading...")
 
-            # Check if CUDA is available
+            # Check if CUDA is available
             if torch.cuda.is_available():
                 torch.cuda.set_device(0)
                 self.device = "cuda:0"
@@ -62,7 +49,7 @@
             self.model = AutoModelForCausalLM.from_pretrained(
                 base_model_name,
                 torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
-                device_map={"": 0},
+                device_map={"": 0} if self.device == "cuda:0" else None,
                 trust_remote_code=True,
                 low_cpu_mem_usage=True,
                 use_safetensors=True,
@@ -79,447 +66,132 @@ class ModelManager:
             logger.error(f"Error loading model: {str(e)}")
             self.model_loaded = False
 
-#
+# Initialize model manager
 model_manager = ModelManager()
 
-def create_json_prompt(message, template_type="general"):
-    """
-
-    json_templates = {
-        "general": {
-            "instruction": "Extract the key points from the content and return them as a JSON array of strings. Each string should be a concise summary of an important point from the content.",
-            "schema": """Format: ["actual key point from content", "another key point from content", "etc..."]"""
-        },
-        "list": {
-            "instruction": "Extract and list the key topics or points from the content. Return them as a JSON array where each element is a specific, factual point from the content. Do not use placeholder text.",
-            "schema": """Return a JSON array of strings, each representing a distinct point from the content. Example format: ["First specific point from the content", "Second specific point", "Third point"]"""
-        },
-        "questions": {
-            "instruction": "Generate 3 diverse user and assistant prompt pairs based on the specific topic provided. Create realistic questions a user might ask and helpful assistant responses.",
-            "schema": """Format: [{"user": "realistic question about the topic", "assistant": "helpful response"}, {"user": "different question", "assistant": "different response"}, {"user": "third question", "assistant": "third response"}]"""
-        },
-        "analysis": {
-            "instruction": "Analyze the following content and respond in JSON format:",
-            "schema": """{
-    "summary": "brief summary of the content",
-    "key_points": [
-        "Key point 1",
-        "Key point 2",
-        "Key point 3"
-    ],
-    "sentiment": "positive|negative|neutral",
-    "topics": ["topic1", "topic2", "topic3"],
-    "complexity_score": 0.75,
-    "word_count": 150
-}"""
-        },
-        "structured": {
-            "instruction": "Process this information and respond in a structured JSON format:",
-            "schema": """{
-    "title": "extracted or generated title",
-    "content": "processed content",
-    "categories": ["category1", "category2"],
-    "tags": ["tag1", "tag2", "tag3"],
-    "priority": "high|medium|low",
-    "action_items": [
-        "Action item 1",
-        "Action item 2"
-    ]
-}"""
-        }
-    }
-
-    template = json_templates.get(template_type, json_templates["general"])
-
-    return f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
-
-{message}
-
-{template["instruction"]}
-
-{template["schema"]}
-
-Important: Respond with valid JSON only. No additional text. Base your response on the actual content provided, not the format examples.
-
-<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
-"""
-
-def prettify_json_response(response_text):
-    """Try to extract and prettify JSON from response"""
-    try:
-        # Clean the response first
-        cleaned = response_text.strip()
-
-        # Try to parse the entire response as JSON first
-        try:
-            parsed_json = json.loads(cleaned)
-            return json.dumps(parsed_json, indent=2, ensure_ascii=False)
-        except json.JSONDecodeError:
-            pass
-
-        # Try to find JSON in the response - look for both objects and arrays
-        # Use non-greedy matching and better patterns
-        json_patterns = [
-            r'\[[\s\S]*?\](?=\s*$)',  # Array pattern - non-greedy, end of string
-            r'\{[\s\S]*?\}(?=\s*$)',  # Object pattern - non-greedy, end of string
-            r'\[[\s\S]*\]',  # Array pattern - greedy fallback
-            r'\{[\s\S]*\}'  # Object pattern - greedy fallback
-        ]
-
-        for pattern in json_patterns:
-            json_match = re.search(pattern, cleaned, re.MULTILINE)
-            if json_match:
-                json_str = json_match.group().strip()
-                try:
-                    parsed_json = json.loads(json_str)
-                    return json.dumps(parsed_json, indent=2, ensure_ascii=False)
-                except json.JSONDecodeError:
-                    continue
-
-        # If no JSON found, return original
-        return response_text
-    except AttributeError:
-        return response_text
-
-def chat_with_model(message, history, temperature, json_mode=False, json_template="general"):
-    """Raw chat function for direct model interaction"""
-    if not message.strip():
-        return history, ""
-
+def generate_response(prompt, temperature=0.8):
+    """Simple function to generate a response from a prompt"""
     if not model_manager.model_loaded:
-        ...
-        history.append({"role": "assistant", "content": response})
-        return history, ""
-
+        return "Model not loaded yet. Please wait..."
+
     try:
-        # Create
-        if json_mode:
-            prompt = create_json_prompt(message, json_template)
-        else:
-            # Create a simple chat prompt
-            prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+        # Create the Llama-3.1 chat format
+        formatted_prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
 
-{message}
+{prompt}
 
 <|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
 """
 
-        #
-        inputs = model_manager.tokenizer(
+        # Tokenize the input
+        inputs = model_manager.tokenizer(
+            formatted_prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=4096
+        )
 
-        #
+        # Move inputs to the same device as the model
         if model_manager.device == "cuda:0":
-            # Get the actual device of the model
             model_device = next(model_manager.model.parameters()).device
-            logger.info(f"Model is on device: {model_device}")
-
-            # Move all input tensors to the same device as the model
             inputs = {k: v.to(model_device) for k, v in inputs.items()}
 
+        # Generate response
         with torch.no_grad():
-            ...
+            outputs = model_manager.model.generate(
+                **inputs,
+                max_new_tokens=8192,
+                temperature=temperature,
+                top_p=0.95,
+                do_sample=True,
+                num_beams=1,
+                pad_token_id=model_manager.tokenizer.eos_token_id,
+                eos_token_id=model_manager.tokenizer.eos_token_id,
+                early_stopping=False,
+                repetition_penalty=1.05,
+                no_repeat_ngram_size=0,
+                length_penalty=1.0,
+                min_new_tokens=50
+            )
 
-        # Decode response
+        # Decode the response
         generated_text = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        #
-        logger.info(f"Full generated text length: {len(generated_text)} characters")
-        logger.info(f"Generated text preview: {generated_text[:300]}...")
-        logger.info(f"Generated text ending: ...{generated_text[-300:]}")
-
-        # Extract the response part (remove the prompt)
+        # Extract just the assistant's response
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
             response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
         else:
-            #
-            response = generated_text
-
-            # Try to find where the actual response starts
-            json_start_patterns = ['[', '{', '"']
-            for pattern in json_start_patterns:
-                if pattern in generated_text:
-                    # Find the first occurrence that looks like the start of JSON
-                    start_idx = generated_text.find(pattern)
-                    if start_idx > len(prompt) // 2:  # Make sure it's after the prompt
-                        response = generated_text[start_idx:].strip()
-                        break
-
-            # Ultimate fallback: use the last portion of the text
-            if response == generated_text:
-                # Split by common delimiters and take the largest chunk
-                chunks = generated_text.split('\n\n')
-                if len(chunks) > 1:
-                    response = chunks[-1].strip()
-                else:
-                    response = generated_text[len(prompt)//2:].strip()
+            # Fallback: remove the prompt from the beginning
+            response = generated_text[len(formatted_prompt):].strip()
 
-        # Log response length for debugging
         logger.info(f"Generated response length: {len(response)} characters")
-
-
-        # Process JSON response if in JSON mode
-        if json_mode and response:
-            original_response = response
-            response = prettify_json_response(response)
-            if response != original_response:
-                logger.info(f"JSON processing applied. New length: {len(response)}")
-            else:
-                logger.info("JSON processing had no effect - no valid JSON found")
-
-        # Add to history
-        history.append({"role": "user", "content": message})
-        history.append({"role": "assistant", "content": response})
+        return response
 
     except Exception as e:
-        logger.error(f"Error
-        ...
+        logger.error(f"Error generating response: {str(e)}")
+        return f"Error: {str(e)}"
+
+def respond(message, history, temperature):
+    """Gradio interface function for chat"""
+    response = generate_response(message, temperature)
+
+    # Update history
+    history.append({"role": "user", "content": message})
+    history.append({"role": "assistant", "content": response})
 
     return history, ""
 
-...
-# Custom CSS for full-width ChatGPT-like appearance
-css = """
-.gradio-container {
-    max-width: 100% !important;
-    width: 100% !important;
-    margin: 0 !important;
-    padding: 20px !important;
-}
-#chatbot {
-    height: 400px !important;
-    max-height: 400px !important;
-    min-height: 400px !important;
-    overflow-y: auto !important;
-    border-radius: 12px !important;
-    border: 1px solid #e0e0e0 !important;
-    background-color: #fafafa !important;
-    color: #212529 !important;
-}
-
-/* Force all text in chatbot to be dark - nuclear option */
-#chatbot,
-#chatbot *,
-[data-testid="chatbot"],
-[data-testid="chatbot"] *,
-.chatbot,
-.chatbot *,
-.gr-chatbot,
-.gr-chatbot * {
-    color: #212529 !important;
-    text-shadow: none !important;
-}
-
-/* Ensure all chatbot text has proper contrast - More specific targeting */
-#chatbot .message,
-#chatbot .bot-message,
-#chatbot .user-message,
-#chatbot div,
-#chatbot p,
-#chatbot span,
-#chatbot .prose,
-#chatbot .markdown,
-.chatbot .message-content,
-.gradio-chatbot .message,
-.gradio-chatbot div,
-.gradio-chatbot p,
-.gradio-chatbot span {
-    color: #212529 !important;
-}
-
-/* Target Gradio's specific chatbot classes */
-.chatbot .bot,
-.chatbot .user,
-.gradio-chatbot,
-.gradio-chatbot * {
-    color: #212529 !important;
-}
-.message {
-    padding: 12px 16px !important;
-    margin: 8px 0 !important;
-    border-radius: 12px !important;
-    max-width: 85% !important;
-    word-wrap: break-word !important;
-}
-.user {
-    background-color: #007bff !important;
-    color: white !important;
-    margin-left: auto !important;
-    margin-right: 0 !important;
-}
-.bot {
-    background-color: #f8f9fa !important;
-    border: 1px solid #e9ecef !important;
-    margin-left: 0 !important;
-    margin-right: auto !important;
-    color: #212529 !important;
-}
-/* Full width input area */
-.gr-textbox {
-    border-radius: 8px !important;
-}
-
-/* Prevent textbox from affecting layout */
-.gradio-textbox textarea {
-    resize: none !important;
-    max-height: 120px !important;
-    min-height: 40px !important;
-}
-
-/* Prevent layout shifts on focus */
-.gradio-container .wrap {
-    min-height: auto !important;
-}
-
-/* Stable row heights */
-.gradio-row {
-    min-height: auto !important;
-}
-/* Responsive design for different screen sizes */
-@media (min-width: 1400px) {
-    .gradio-container {
-        padding: 40px !important;
-    }
-    #chatbot {
-        height: 450px !important;
-        max-height: 450px !important;
-    }
-}
-@media (min-width: 1800px) {
-    .gradio-container {
-        padding: 60px !important;
-    }
-    #chatbot {
-        height: 500px !important;
-        max-height: 500px !important;
-    }
-}
-"""
-
-# Create simplified chat interface with JSON functionality
-with gr.Blocks(css=css, title="Llama Chat", theme=gr.themes.Soft()) as demo:
-    gr.Markdown(
-        """
-        # 🦙 Llama Chat
-        ### Raw interface for Llama-3.1-8B-Instruct
-
-        Direct chat interface for testing prompts and having conversations with the model.
-
-        **New:** Enable **JSON Response Mode** for structured outputs! Choose from templates like:
-        - 🎯 **General**: Basic structured responses
-        - ❓ **Questions**: Generate question sets from content
-        - 📊 **Analysis**: Content analysis with sentiment & topics
-        - 📋 **Structured**: Organized data with categories & actions
-        """
-    )
-
-    # Simple chat interface
-    chatbot = gr.Chatbot(
-        elem_id="chatbot",
-        label="Chat",
-        show_label=False,
-        avatar_images=(None, None),
-        show_share_button=False,
-        type="messages",  # Use new message format
-        height=400,  # Reduced from 600 to 400
-        render_markdown=True,
-        show_copy_button=True,
-        container=True,
-        scale=1
-    )
+# Create the Gradio interface
+with gr.Blocks(title="Question Generation API") as demo:
+    gr.Markdown("# Simple LLM API")
+    gr.Markdown("Send a prompt and get a response. No templates, just direct model interaction.")
 
     with gr.Row():
         with gr.Column(scale=4):
+            chatbot = gr.Chatbot(
+                label="Chat",
+                type="messages",
+                height=400
+            )
             msg = gr.Textbox(
-                ...
-                lines=1,
-                max_lines=3,
-                autofocus=False,
-                interactive=True
+                label="Message",
+                placeholder="Enter your prompt here...",
+                lines=3
             )
+            with gr.Row():
+                submit = gr.Button("Send", variant="primary")
+                clear = gr.Button("Clear")
+
         with gr.Column(scale=1):
-            ...
-                maximum=2.0,
-                value=0.8,
-                step=0.1,
-                label="Temperature",
-                info="Controls randomness (0.1=focused, 2.0=creative)"
-            )
-
-    with gr.Row():
-        with gr.Column(scale=2):
-            json_mode = gr.Checkbox(
-                label="JSON Response Mode",
-                value=False,
-                info="Get structured JSON responses instead of regular text"
-            )
-        with gr.Column(scale=3):
-            json_template = gr.Dropdown(
-                choices=["general", "questions", "analysis", "structured"],
-                value="general",
-                label="JSON Template",
-                info="Choose the type of JSON structure you want",
-                visible=False
+            temperature = gr.Slider(
+                minimum=0.1,
+                maximum=2.0,
+                value=0.8,
+                step=0.1,
+                label="Temperature",
+                info="Higher = more creative"
             )
+            gr.Markdown("""
+            ### API Usage
+            This model accepts any prompt and returns a response.
+
+            For JSON responses, include instructions in your prompt like:
+            - "Return as a JSON array"
+            - "Format as JSON"
+            - "List as JSON"
+
+            The model will follow your instructions.
+            """)
 
-    #
-    ...
-    def toggle_json_template(json_enabled):
-        return gr.update(visible=json_enabled)
-
-    # Connect JSON mode toggle to template visibility
-    json_mode.change(toggle_json_template, inputs=[json_mode], outputs=[json_template])
-
-    msg.submit(respond, [msg, chatbot, temperature, json_mode, json_template], [chatbot, msg])
-    submit_btn.click(respond, [msg, chatbot, temperature, json_mode, json_template], [chatbot, msg])
-    clear_btn.click(clear_chat, outputs=[chatbot, msg])
-
-    # Add footer
-    gr.Markdown(
-        """
-        ---
-        <div style="text-align: center; color: #666; font-size: 0.9em;">
-        Built with ❤️ using Gradio and Llama-3.1-8B-Instruct •
-        <a href="/docs" target="_blank">API Documentation</a> •
-        JSON Mode for structured outputs
-        </div>
-        """
-    )
+    # Set up event handlers
+    submit.click(respond, [msg, chatbot, temperature], [chatbot, msg])
+    msg.submit(respond, [msg, chatbot, temperature], [chatbot, msg])
+    clear.click(lambda: ([], ""), outputs=[chatbot, msg])
 
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False
-        show_error=True
+        share=False
     )
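With the templates removed, output structure is whatever the caller asks for in the prompt, which is exactly what the new in-app "API Usage" note recommends. A small illustrative sketch of that pattern against the new generate_response function; importing gradio_app and having the model already loaded are assumptions, and the example prompt text is made up:

# Illustrative only: request JSON through the prompt, then parse it client-side.
# Assumes gradio_app is importable and ModelManager has finished loading the model.
import json
from gradio_app import generate_response

prompt = (
    "Extract the key points from the text below and return ONLY a JSON array of short strings.\n\n"
    "Text: The new release adds offline mode, fixes the login crash on Android, and doubles sync speed."
)
raw = generate_response(prompt, temperature=0.3)

try:
    points = json.loads(raw)   # the old prettify_json_response helper is gone, so parse here
except json.JSONDecodeError:
    points = [raw]             # model ignored the instruction; fall back to the raw text
print(points)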
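generate_response builds the Llama-3.1 chat format by hand from the <|begin_of_text|> and <|start_header_id|> markers. An alternative sketch, not what this commit does, is to let the tokenizer's own chat template emit those tokens so the prompt cannot drift from the model's expected format:

# Alternative sketch (assumption: the tokenizer ships a chat template, as the
# Llama-3.1 Instruct tokenizer does). Not the approach used in this commit.
import torch

def build_inputs(tokenizer, prompt):
    messages = [{"role": "user", "content": prompt}]
    # apply_chat_template inserts the special tokens and the trailing assistant header
    return tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )

# Usage against the objects defined in gradio_app.py (model assumed loaded):
# input_ids = build_inputs(model_manager.tokenizer, "Hello!")
# with torch.no_grad():
#     output_ids = model_manager.model.generate(input_ids, max_new_tokens=256)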