import logging
from typing import Optional, Dict, Any, List, AsyncIterator
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import json
from config import config
from utils.json_extractor import extract_json_from_content
logger = logging.getLogger("text-service")


class TextService:
    """Service for text-based language model interactions.

    Wraps a llama.cpp chat model: downloads the weights from the Hugging
    Face Hub, loads them into memory, and exposes completion generation
    in three modes — plain, SSE streaming, and JSON extraction.
    """

    def __init__(self) -> None:
        # Populated lazily by initialize(); None until then.
        self.model: Optional[Llama] = None

    async def initialize(self) -> None:
        """Download and load the text model.

        Raises:
            Exception: re-raises any download or load failure after logging.
        """
        # NOTE(review): hf_hub_download() and Llama() are synchronous,
        # potentially long-running calls and will block the event loop for
        # their duration — consider asyncio.to_thread if startup latency
        # matters to other coroutines.
        try:
            logger.info("Downloading text model: %s...", config.TEXT_MODEL_FILE)
            model_path = hf_hub_download(
                repo_id=config.TEXT_MODEL_REPO,
                filename=config.TEXT_MODEL_FILE,
                cache_dir=config.HF_HOME,
            )
            logger.info("Loading text model (Threads: %s)...", config.N_THREADS)
            self.model = Llama(
                model_path=model_path,
                n_ctx=config.TEXT_MODEL_CTX,
                n_threads=config.N_THREADS,
                n_batch=config.TEXT_MODEL_BATCH,
                verbose=False,
            )
            logger.info("✓ Text model loaded successfully")
        except Exception as e:
            logger.error("Failed to initialize text model: %s", e)
            raise

    def is_ready(self) -> bool:
        """Return True once the model has been loaded."""
        return self.model is not None

    async def generate_completion(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.6,
        max_tokens: int = 512,
        stream: bool = False,
        return_json: bool = False
    ) -> Any:
        """Generate a chat completion.

        Args:
            messages: List of message dictionaries with 'role' and 'content'.
            temperature: Sampling temperature.
            max_tokens: Maximum tokens to generate.
            stream: Whether to stream the response as SSE frames.
            return_json: Whether to extract JSON from the response.

        Returns:
            The raw completion dict, an async iterator of SSE strings
            (stream=True), or a {"status", "data"} dict (return_json=True).

        Raises:
            RuntimeError: if the model has not been initialized.
            ValueError: if both 'stream' and 'return_json' are requested.
        """
        if not self.is_ready():
            raise RuntimeError("Text model not initialized")

        # Streaming and JSON extraction are mutually exclusive modes.
        if stream and return_json:
            raise ValueError("Cannot use both 'stream' and 'return_json' simultaneously")

        if return_json:
            system_prompt = {
                "role": "system",
                "content": (
                    "You are a strict JSON generator. "
                    "Convert the user's input into valid JSON format. "
                    "Output strictly in markdown code blocks like ```json ... ```. "
                    "Do not add conversational filler."
                )
            }
            # Shallow-copy each message so the JSON instruction appended
            # below never mutates the caller's dicts in place (the previous
            # implementation modified the caller's last message directly).
            messages = [system_prompt] + [dict(m) for m in messages]
            if messages[-1]['role'] == 'user':
                messages[-1]['content'] += "\n\nReturn structured JSON of this content."

        logger.info("Generating completion: %d messages | Stream: %s", len(messages), stream)
        try:
            response = self.model.create_chat_completion(
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                stream=stream
            )

            # Streaming: wrap the raw chunk iterator in SSE framing.
            if stream:
                return self._create_stream_iterator(response)

            # JSON mode: pull the model text out and extract structured data.
            if return_json:
                content_text = response['choices'][0]['message']['content']
                extracted_data = extract_json_from_content(content_text)
                return {
                    "status": "success",
                    "data": extracted_data
                }

            return response
        except Exception as e:
            logger.error("Error generating completion: %s", e)
            raise

    async def _create_stream_iterator(self, response_stream) -> AsyncIterator[str]:
        """Yield one SSE 'data:' frame per chunk, then a terminal [DONE] frame.

        NOTE(review): the llama.cpp stream is a synchronous iterator; each
        next() blocks the event loop while a token is produced — confirm
        this is acceptable for the serving setup.
        """
        for chunk in response_stream:
            yield f"data: {json.dumps(chunk)}\n\n"
        yield "data: [DONE]\n\n"

    async def cleanup(self) -> None:
        """Release the loaded model and reset the service to the unloaded state."""
        if self.model:
            del self.model
            self.model = None
            logger.info("Text model unloaded")
# Global singleton used by the rest of the application; the model itself
# is not loaded here — callers must await text_service.initialize() first.
text_service = TextService()