# NOTE: The lines "Spaces: / Running / Running" here were Hugging Face Spaces
# page chrome captured when this file was exported from the web UI; they are
# not part of the application code.
| import sys | |
| import os | |
| import json | |
| import requests | |
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel | |
| from typing import List, Optional, Dict, Any | |
| import uvicorn | |
| import gradio as gr | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import logging | |
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI app for hosting the Gradio interface
# NOTE(review): `app` is created but never mounted or run anywhere in this
# file — the __main__ block launches Gradio directly. Presumably kept for
# future API routes; confirm before removing.
app = FastAPI(title="DevOps SLM Interface")

# Your Hugging Face endpoint details
HF_ENDPOINT_URL = "https://bcg2lrpnfylqamcz.us-east-1.aws.endpoints.huggingface.cloud"
HF_API_TOKEN = os.getenv("HF_API_TOKEN")  # Must be set in Space environment variables

# User-friendly API base URL
# NOTE(review): not referenced elsewhere in this file — documentation value only.
API_BASE_URL = "https://lakhera2023-devops-slm-chat.hf.space"
# Pydantic models for OpenAI API
class CompletionRequest(BaseModel):
    """Request body for an OpenAI-style text-completion call.

    Mirrors the OpenAI `/v1/completions` schema; defaults match common
    OpenAI client defaults.

    NOTE(review): no route in this file consumes this model — presumably
    kept for a future FastAPI endpoint on `app`.
    """

    prompt: str                                 # text to complete
    max_tokens: Optional[int] = 100             # cap on generated tokens
    temperature: Optional[float] = 0.7          # sampling temperature
    top_p: Optional[float] = 1.0                # nucleus-sampling cutoff
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    stop: Optional[List[str]] = None            # stop sequences, if any
    stream: Optional[bool] = False              # streaming flag (unused here)
class ChatMessage(BaseModel):
    """One message in an OpenAI-style chat conversation."""

    role: str      # e.g. "system" / "user" / "assistant" — not validated here
    content: str   # message text
class ChatCompletionRequest(BaseModel):
    """Request body for an OpenAI-style chat-completion call.

    Same sampling knobs as CompletionRequest but takes a message list
    instead of a raw prompt.

    NOTE(review): no route in this file consumes this model — presumably
    kept for a future FastAPI endpoint on `app`.
    """

    messages: List[ChatMessage]                 # conversation history
    max_tokens: Optional[int] = 100
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    stop: Optional[List[str]] = None
    stream: Optional[bool] = False
# DevOps SLM Inference Class (from your existing code)
class DevOpsSLMInference:
    """Lazy-loading wrapper around the `lakhera2023/devops-slm` causal LM.

    Nothing is downloaded at construction time; the model and tokenizer are
    loaded on the first call to generate_response() (or an explicit
    load_model()), so importing this module stays cheap.
    """

    def __init__(self):
        # Populated lazily by load_model(); None means "not loaded yet".
        self.model = None
        self.tokenizer = None
        # Prefer GPU when available; also drives dtype/device_map choices below.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = "lakhera2023/devops-slm"

    def load_model(self):
        """Load the DevOps SLM model and tokenizer.

        Returns:
            bool: True on success, False on any failure (the error is logged
            rather than raised so callers can degrade gracefully).
        """
        try:
            logger.info(f"Loading model: {self.model_name}")
            logger.info(f"Using device: {self.device}")
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            # fp16 + device_map="auto" on GPU; fp32 on CPU where half precision
            # is slow or unsupported. trust_remote_code allows custom model
            # code shipped with the checkpoint to run.
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
            # Some causal LMs ship without a pad token; generation needs one,
            # so fall back to EOS.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            logger.info("Model loaded successfully!")
            return True
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            return False

    def generate_response(self, prompt, max_tokens=200, temperature=0.7,
                          top_p=0.9, top_k=50, max_input_length=512):
        """Generate a response using the DevOps SLM.

        Args:
            prompt: User prompt to complete.
            max_tokens: Maximum number of NEW tokens to generate.
            temperature: Sampling temperature (do_sample is always on).
            top_p: Nucleus-sampling cutoff.
            top_k: Top-k sampling cutoff.
            max_input_length: Truncation limit for the tokenized prompt.
                Previously hard-coded to 512; the model advertises a
                2048-token context, so callers may now raise this without
                changing default behavior.

        Returns:
            str: Generated text, or an "Error: ..." string on failure
            (this method never raises).
        """
        # Lazy-load on first use.
        if self.model is None or self.tokenizer is None:
            if not self.load_model():
                return "Error: Could not load model"
        try:
            # Tokenize input, truncating to stay within the model's context.
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True,
                                    max_length=max_input_length)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            # Generate response (sampling, with mild anti-repetition settings).
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    top_k=top_k,
                    do_sample=True,
                    num_return_sequences=1,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.1,
                    no_repeat_ngram_size=2
                )
            # Decode, then strip the echoed input prompt from the front.
            # NOTE(review): startswith() can miss when the tokenizer does not
            # round-trip the prompt byte-for-byte — confirm against real outputs.
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            if response.startswith(prompt):
                response = response[len(prompt):].strip()
            # Clean up chat-template artifacts left in the raw text.
            response = response.replace("<|im_start|>", "").replace("<|im_end|>", "").strip()
            return response
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error: {str(e)}"
# Initialize the inference class
# Module-level singleton shared by the Gradio event handlers; the underlying
# model is loaded lazily, so this line itself is cheap.
devops_slm = DevOpsSLMInference()
# Utility functions for OpenAI API
def count_tokens(text: str) -> int:
    """Estimate the number of tokens in *text*.

    Rough approximation only: counts whitespace-separated words and scales
    by 1.3 (a common average tokens-per-word ratio for English) — swap in a
    real tokenizer for accurate counts.

    Fix: the original returned a float (``len(...) * 1.3``) despite the
    ``-> int`` annotation; the result is now truncated to an int.
    """
    return int(len(text.split()) * 1.3)
def call_hf_endpoint(prompt: str, max_tokens: int = 100,
                     temperature: float = 0.7) -> Dict[str, Any]:
    """Call the Hugging Face Inference Endpoint with a raw prompt.

    Args:
        prompt: Text sent as the ``inputs`` field of the HF inference payload.
        max_tokens: Mapped to ``max_new_tokens`` in the request parameters.
        temperature: Sampling temperature (previously hard-coded to 0.7;
            now a backward-compatible parameter with the same default).

    Returns:
        The endpoint's JSON response, decoded.

    Raises:
        HTTPException: 500 wrapping any network/HTTP failure, so FastAPI
        routes can surface it directly.
    """
    headers = {
        # HF_API_TOKEN comes from the Space's environment; requests fail
        # with 401 if it is unset.
        "Authorization": f"Bearer {HF_API_TOKEN}",
        "Content-Type": "application/json"
    }
    data = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "do_sample": True,
            # Only return the completion, not the echoed prompt.
            "return_full_text": False
        }
    }
    try:
        response = requests.post(HF_ENDPOINT_URL, headers=headers, json=data, timeout=60)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=f"Hugging Face API error: {str(e)}")
# Simple health check endpoint
async def health_check():
    """Report service liveness plus which model/interface this Space serves."""
    payload = {
        "status": "healthy",
        "model": "devops-slm",
        "interface": "gradio",
    }
    return payload
async def api_info():
    """Describe how to reach this model: inference endpoint, UI URL, and docs pointer."""
    endpoint_url = "https://bcg2lrpnfylqamcz.us-east-1.aws.endpoints.huggingface.cloud"
    info = {
        "message": "DevOps SLM - Use Inference Endpoint for API Access",
        "inference_endpoint": endpoint_url,
        "ui_url": "https://huggingface.co/spaces/lakhera2023/devops-slm-chat/ui",
        "note": "For API access, use the Hugging Face Inference Endpoint. See README.md for examples.",
    }
    return info
# Gradio interface (from your existing code)
# Canned prompts shown as quick-fill buttons in the UI; only the first four
# are rendered (see create_gradio_interface).
example_prompts = [
    "How do I deploy a microservice to Kubernetes?",
    "What are the best practices for container security?",
    "How can I monitor application performance in production?",
    "Explain the difference between Docker and Kubernetes",
    "What is CI/CD and how do I implement it?",
    "Create a Kubernetes deployment YAML for a web application",
    "How do I set up a Docker multi-stage build?",
    "What are the key components of a DevOps pipeline?"
]
def create_gradio_interface():
    """Create the Gradio interface.

    Builds a two-column Blocks layout: prompt + sampling controls on the
    left, example-prompt buttons on the right, a full-width output box
    below, and static model documentation at the bottom.

    Returns:
        gr.Blocks: the assembled (not yet launched) demo.
    """
    with gr.Blocks(
        title="DevOps SLM - Specialized Language Model",
        theme=gr.themes.Soft(),
        # Widen the default container so the two-column layout has room.
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        """
    ) as demo:
        # Page header. (The stray characters before headings are mojibake of
        # the original emoji from a bad encoding round-trip — left as-is here
        # since they are runtime strings.)
        gr.Markdown("""
        # π DevOps Specialized Language Model
        A specialized AI model trained for DevOps tasks, Kubernetes operations, Docker containerization,
        CI/CD pipelines, and infrastructure management.
        **Model:** [lakhera2023/devops-slm](https://huggingface.co/lakhera2023/devops-slm)
        """)
        with gr.Row():
            # Left column: prompt input and sampling controls.
            with gr.Column(scale=2):
                prompt_input = gr.Textbox(
                    label="DevOps Question or Task",
                    placeholder="Ask me anything about DevOps, Kubernetes, Docker, CI/CD, or infrastructure...",
                    lines=3
                )
                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50, maximum=500, value=200, step=10,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature"
                    )
                with gr.Row():
                    top_p = gr.Slider(
                        minimum=0.1, maximum=1.0, value=0.9, step=0.05,
                        label="Top-p"
                    )
                    top_k = gr.Slider(
                        minimum=1, maximum=100, value=50, step=1,
                        label="Top-k"
                    )
                generate_btn = gr.Button("Generate Response", variant="primary", size="lg")
            # Right column: quick-fill example buttons (first four only).
            with gr.Column(scale=1):
                gr.Markdown("### π Example Prompts")
                for i, example in enumerate(example_prompts[:4]):
                    # `x=example` binds the current prompt as a default arg so
                    # each button fills its own text (avoids late-binding bug).
                    gr.Button(
                        example,
                        size="sm"
                    ).click(
                        lambda x=example: x,
                        outputs=prompt_input
                    )
        with gr.Row():
            output = gr.Textbox(
                label="DevOps Response",
                lines=10,
                show_copy_button=True
            )
        # Event handlers: both the button and Enter-in-textbox trigger the
        # same generation call on the shared `devops_slm` singleton.
        generate_btn.click(
            fn=devops_slm.generate_response,
            inputs=[prompt_input, max_tokens, temperature, top_p, top_k],
            outputs=output
        )
        # Allow Enter key to generate
        prompt_input.submit(
            fn=devops_slm.generate_response,
            inputs=[prompt_input, max_tokens, temperature, top_p, top_k],
            outputs=output
        )
        # Static documentation footer.
        gr.Markdown("""
        ### π― Model Capabilities
        - **Kubernetes Operations**: Pod management, deployments, services, configmaps, secrets
        - **Docker Containerization**: Container creation, optimization, and best practices
        - **CI/CD Pipeline Management**: Pipeline design, automation, and troubleshooting
        - **Infrastructure Automation**: Infrastructure as Code, provisioning, scaling
        - **Monitoring and Observability**: Logging, metrics, alerting, debugging
        - **Cloud Platform Operations**: Multi-cloud deployment and management
        ### π Model Details
        - **Base Architecture**: Qwen (494M parameters)
        - **Specialization**: DevOps, Kubernetes, Docker, CI/CD, Infrastructure
        - **Max Sequence Length**: 2048 tokens
        - **Model Type**: Instruction-tuned for DevOps domain
        ### π API Integration
        This model is available via Hugging Face Inference Endpoint:
        - **Inference Endpoint**: `https://bcg2lrpnfylqamcz.us-east-1.aws.endpoints.huggingface.cloud`
        - **Format**: Standard Hugging Face inference API with `inputs` parameter
        - **Documentation**: See README.md for complete API examples
        """)
    return demo
# Create Gradio interface
demo = create_gradio_interface()

# For Hugging Face Spaces, just launch Gradio directly
if __name__ == "__main__":
    # Bind on all interfaces; 7860 is the standard Spaces port. share=False
    # because Spaces already provides the public URL.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )