import sys
import os
import json
import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
import uvicorn
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI app for hosting the Gradio interface
app = FastAPI(title="DevOps SLM Interface")

# Your Hugging Face endpoint details
HF_ENDPOINT_URL = "https://bcg2lrpnfylqamcz.us-east-1.aws.endpoints.huggingface.cloud"
HF_API_TOKEN = os.getenv("HF_API_TOKEN")  # Must be set in Space environment variables

# User-friendly API base URL
API_BASE_URL = "https://lakhera2023-devops-slm-chat.hf.space"

# Pydantic models for OpenAI-compatible API requests
class CompletionRequest(BaseModel):
    prompt: str
    max_tokens: Optional[int] = 100
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    stop: Optional[List[str]] = None
    stream: Optional[bool] = False


class ChatMessage(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 100
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    stop: Optional[List[str]] = None
    stream: Optional[bool] = False


# DevOps SLM Inference Class (from your existing code)
class DevOpsSLMInference:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = "lakhera2023/devops-slm"

    def load_model(self):
        """Load the DevOps SLM model and tokenizer."""
        try:
            logger.info(f"Loading model: {self.model_name}")
            logger.info(f"Using device: {self.device}")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

            # Load model
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )

            # Set pad token if not present
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            logger.info("Model loaded successfully!")
            return True
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            return False

    def generate_response(self, prompt, max_tokens=200, temperature=0.7, top_p=0.9, top_k=50):
        """Generate a response using the DevOps SLM."""
        if self.model is None or self.tokenizer is None:
            if not self.load_model():
                return "Error: Could not load model"

        try:
            # Tokenize input
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Generate response (Gradio sliders pass floats, so cast the integer-valued parameters)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=int(max_tokens),
                    temperature=temperature,
                    top_p=top_p,
                    top_k=int(top_k),
                    do_sample=True,
                    num_return_sequences=1,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.1,
                    no_repeat_ngram_size=2
                )

            # Decode response
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Remove the input prompt from the response
            if response.startswith(prompt):
                response = response[len(prompt):].strip()

            # Clean up chat-template artifacts
            response = response.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()

            return response
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error: {str(e)}"


# Initialize the inference class
devops_slm = DevOpsSLMInference()


# Utility functions for the OpenAI-compatible API
def count_tokens(text: str) -> int:
    """Simple token counting - swap in a proper tokenizer for accurate counts."""
    return int(len(text.split()) * 1.3)  # Rough approximation, cast to match the int annotation


def call_hf_endpoint(prompt: str, max_tokens: int = 100) -> Dict[str, Any]:
    """Call the Hugging Face Inference Endpoint."""
    headers = {
        "Authorization": f"Bearer {HF_API_TOKEN}",
        "Content-Type": "application/json"
    }
    data = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": 0.7,
            "do_sample": True,
            "return_full_text": False
        }
    }
    try:
        response = requests.post(HF_ENDPOINT_URL, headers=headers, json=data, timeout=60)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Hugging Face API error: {str(e)}")


# Simple health check endpoint
@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy", "model": "devops-slm", "interface": "gradio"}


@app.get("/api")
async def api_info():
    """API information endpoint."""
    return {
        "message": "DevOps SLM - Use Inference Endpoint for API Access",
        "inference_endpoint": "https://bcg2lrpnfylqamcz.us-east-1.aws.endpoints.huggingface.cloud",
        "ui_url": "https://huggingface.co/spaces/lakhera2023/devops-slm-chat/ui",
        "note": "For API access, use the Hugging Face Inference Endpoint. See README.md for examples."
    }


# Gradio interface (from your existing code)
example_prompts = [
    "How do I deploy a microservice to Kubernetes?",
    "What are the best practices for container security?",
    "How can I monitor application performance in production?",
    "Explain the difference between Docker and Kubernetes",
    "What is CI/CD and how do I implement it?",
    "Create a Kubernetes deployment YAML for a web application",
    "How do I set up a Docker multi-stage build?",
    "What are the key components of a DevOps pipeline?"
]


def create_gradio_interface():
    """Create the Gradio interface."""
    with gr.Blocks(
        title="DevOps SLM - Specialized Language Model",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        """
    ) as demo:
        gr.Markdown("""
        # 🚀 DevOps Specialized Language Model

        A specialized AI model trained for DevOps tasks, Kubernetes operations, Docker containerization,
        CI/CD pipelines, and infrastructure management.

        **Model:** [lakhera2023/devops-slm](https://huggingface.co/lakhera2023/devops-slm)
        """)

        with gr.Row():
            with gr.Column(scale=2):
                prompt_input = gr.Textbox(
                    label="DevOps Question or Task",
                    placeholder="Ask me anything about DevOps, Kubernetes, Docker, CI/CD, or infrastructure...",
                    lines=3
                )

                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50,
                        maximum=500,
                        value=200,
                        step=10,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=2.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature"
                    )

                with gr.Row():
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.9,
                        step=0.05,
                        label="Top-p"
                    )
                    top_k = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=50,
                        step=1,
                        label="Top-k"
                    )

                generate_btn = gr.Button("Generate Response", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### 📝 Example Prompts")
                for i, example in enumerate(example_prompts[:4]):
                    gr.Button(
                        example,
                        size="sm"
                    ).click(
                        lambda x=example: x,
                        outputs=prompt_input
                    )

        with gr.Row():
            output = gr.Textbox(
                label="DevOps Response",
                lines=10,
                show_copy_button=True
            )

        # Event handlers
        generate_btn.click(
            fn=devops_slm.generate_response,
            inputs=[prompt_input, max_tokens, temperature, top_p, top_k],
            outputs=output
        )

        # Allow Enter key to generate
        prompt_input.submit(
            fn=devops_slm.generate_response,
            inputs=[prompt_input, max_tokens, temperature, top_p, top_k],
            outputs=output
        )

        gr.Markdown("""
        ### 🎯 Model Capabilities
        - **Kubernetes Operations**: Pod management, deployments, services, configmaps, secrets
        - **Docker Containerization**: Container creation, optimization, and best practices
        - **CI/CD Pipeline Management**: Pipeline design, automation, and troubleshooting
        - **Infrastructure Automation**: Infrastructure as Code, provisioning, scaling
        - **Monitoring and Observability**: Logging, metrics, alerting, debugging
        - **Cloud Platform Operations**: Multi-cloud deployment and management

        ### 📊 Model Details
        - **Base Architecture**: Qwen (494M parameters)
        - **Specialization**: DevOps, Kubernetes, Docker, CI/CD, Infrastructure
        - **Max Sequence Length**: 2048 tokens
        - **Model Type**: Instruction-tuned for DevOps domain

        ### 🔌 API Integration
        This model is available via a Hugging Face Inference Endpoint:
        - **Inference Endpoint**: `https://bcg2lrpnfylqamcz.us-east-1.aws.endpoints.huggingface.cloud`
        - **Format**: Standard Hugging Face inference API with `inputs` parameter
        - **Documentation**: See README.md for complete API examples
        """)

    return demo


# Create Gradio interface
demo = create_gradio_interface()

# For Hugging Face Spaces, just launch Gradio directly
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
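

# ---------------------------------------------------------------------------
# Minimal client sketch (illustrative only, never called by the app): shows
# how an external caller might hit the Hugging Face Inference Endpoint
# directly, using the same payload shape as call_hf_endpoint() above. The
# function name, default prompt, and the assumed response format (a list of
# {"generated_text": ...} dicts, the usual text-generation shape) are
# assumptions, and HF_API_TOKEN must be set in the environment.
# ---------------------------------------------------------------------------
def example_client_call(prompt: str = "How do I deploy a microservice to Kubernetes?") -> str:
    """Illustrative example: send one prompt to the Inference Endpoint and return the text."""
    headers = {
        "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}",
        "Content-Type": "application/json",
    }
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,
            "temperature": 0.7,
            "do_sample": True,
            "return_full_text": False,
        },
    }
    response = requests.post(HF_ENDPOINT_URL, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
    result = response.json()
    # Text-generation endpoints typically return [{"generated_text": "..."}]
    return result[0].get("generated_text", "") if isinstance(result, list) else str(result)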