import logging
from typing import Optional, Dict, Any, List, AsyncIterator
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import json
from config import config
from utils.json_extractor import extract_json_from_content
logger = logging.getLogger("text-service")


class TextService:
    """Service for text-based language model interactions.

    Wraps a llama.cpp chat model: downloads the weights from the Hugging
    Face Hub, loads them into memory, and exposes completion generation
    in three modes — plain, SSE streaming, and JSON extraction.
    """

    def __init__(self) -> None:
        # Populated lazily by initialize(); None until then.
        self.model: Optional[Llama] = None

    async def initialize(self) -> None:
        """Download and load the text model.

        Raises:
            Exception: re-raises any download or load failure after logging.
        """
        # NOTE(review): hf_hub_download() and Llama() are synchronous,
        # potentially long-running calls and will block the event loop for
        # their duration — consider asyncio.to_thread if startup latency
        # matters to other coroutines.
        try:
            logger.info("Downloading text model: %s...", config.TEXT_MODEL_FILE)
            model_path = hf_hub_download(
                repo_id=config.TEXT_MODEL_REPO,
                filename=config.TEXT_MODEL_FILE,
                cache_dir=config.HF_HOME,
            )
            logger.info("Loading text model (Threads: %s)...", config.N_THREADS)
            self.model = Llama(
                model_path=model_path,
                n_ctx=config.TEXT_MODEL_CTX,
                n_threads=config.N_THREADS,
                n_batch=config.TEXT_MODEL_BATCH,
                verbose=False,
            )
            logger.info("✓ Text model loaded successfully")
        except Exception as e:
            logger.error("Failed to initialize text model: %s", e)
            raise

    def is_ready(self) -> bool:
        """Return True once the model has been loaded."""
        return self.model is not None

    async def generate_completion(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.6,
        max_tokens: int = 512,
        stream: bool = False,
        return_json: bool = False
    ) -> Any:
        """Generate a chat completion.

        Args:
            messages: List of message dictionaries with 'role' and 'content'.
            temperature: Sampling temperature.
            max_tokens: Maximum tokens to generate.
            stream: Whether to stream the response as SSE frames.
            return_json: Whether to extract JSON from the response.

        Returns:
            The raw completion dict, an async iterator of SSE strings
            (stream=True), or a {"status", "data"} dict (return_json=True).

        Raises:
            RuntimeError: if the model has not been initialized.
            ValueError: if both 'stream' and 'return_json' are requested.
        """
        if not self.is_ready():
            raise RuntimeError("Text model not initialized")

        # Streaming and JSON extraction are mutually exclusive modes.
        if stream and return_json:
            raise ValueError("Cannot use both 'stream' and 'return_json' simultaneously")

        if return_json:
            system_prompt = {
                "role": "system",
                "content": (
                    "You are a strict JSON generator. "
                    "Convert the user's input into valid JSON format. "
                    "Output strictly in markdown code blocks like ```json ... ```. "
                    "Do not add conversational filler."
                )
            }
            # Shallow-copy each message so the JSON instruction appended
            # below never mutates the caller's dicts in place (the previous
            # implementation modified the caller's last message directly).
            messages = [system_prompt] + [dict(m) for m in messages]
            if messages[-1]['role'] == 'user':
                messages[-1]['content'] += "\n\nReturn structured JSON of this content."

        logger.info("Generating completion: %d messages | Stream: %s", len(messages), stream)
        try:
            response = self.model.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                stream=stream
            )

            # Streaming: wrap the raw chunk iterator in SSE framing.
            if stream:
                return self._create_stream_iterator(response)

            # JSON mode: pull the model text out and extract structured data.
            if return_json:
                content_text = response['choices'][0]['message']['content']
                extracted_data = extract_json_from_content(content_text)
                return {
                    "status": "success",
                    "data": extracted_data
                }

            return response
        except Exception as e:
            logger.error("Error generating completion: %s", e)
            raise

    async def _create_stream_iterator(self, response_stream) -> AsyncIterator[str]:
        """Yield one SSE 'data:' frame per chunk, then a terminal [DONE] frame.

        NOTE(review): the llama.cpp stream is a synchronous iterator; each
        next() blocks the event loop while a token is produced — confirm
        this is acceptable for the serving setup.
        """
        for chunk in response_stream:
            yield f"data: {json.dumps(chunk)}\n\n"
        yield "data: [DONE]\n\n"

    async def cleanup(self) -> None:
        """Release the loaded model and reset the service to the unloaded state."""
        if self.model:
            del self.model
            self.model = None
            logger.info("Text model unloaded")
# Global singleton used by the rest of the application; the model itself
# is not loaded here — callers must await text_service.initialize() first.
text_service = TextService()