dvalle08 committed
Commit a16f712 · 1 Parent(s): 5cb235d

Refactor LLM and voice provider settings, enhance configuration options, and update dependencies

.env.example CHANGED
@@ -1,25 +1,18 @@
-
-LLM_PROVIDER=nvidia # or "huggingface"
+# NVIDIA LLM Settings
 NVIDIA_API_KEY=your_nvidia_api_key_here
 NVIDIA_MODEL=meta/llama-3.1-8b-instruct
 
-# Uncomment the following block when using HuggingFace instead of NVIDIA
-# LLM_PROVIDER=huggingface
-# HF_MODEL=microsoft/DialoGPT-medium
-# HF_TOKEN=your_huggingface_token_here # Get from: https://huggingface.co/settings/tokens
-# HF_USE_INFERENCE_API=false # true to use the Hugging Face Inference API, false to run locally
-# HF_TRUST_REMOTE_CODE=false # Enable when the repo requires custom model/tokenizer code
-# HF_USE_FAST_TOKENIZER=false # Set to true when you need the fast tokenizer; disable to avoid legacy conversion issues
-
-
-# Voice Provider Options
-# NVIDIA API uses the same NVIDIA_API_KEY as the LLM provider
+# HuggingFace Settings
+HF_TOKEN=your_huggingface_token_here # Get from: https://huggingface.co/settings/tokens
+
+# LLM Parameters
+LLM_TEMPERATURE=0.7
+LLM_MAX_TOKENS=1024
+
+# Voice Provider Settings
 VOICE_PROVIDER=nvidia
-
-# NVIDIA Voice Settings (default)
 NVIDIA_VOICE_LANGUAGE=en-US
 NVIDIA_VOICE_NAME=Magpie-Multilingual.EN-US.Aria
 
 # NVIDIA TTS requires an endpoint from build.nvidia.com
-# Get your TTS endpoint from: https://build.nvidia.com/
 NVIDIA_TTS_ENDPOINT=https://your-tts-endpoint-here
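
For context, a minimal sketch of how the flattened variables above get consumed. The field names and constraints mirror the src/core/settings.py hunk further down; the standalone class and its SettingsConfigDict options are illustrative assumptions, not the repo's exact CoreSettings base.

```python
# Sketch only: mirrors the LLMSettings fields from the settings.py diff below.
# The env_file/extra options here are assumptions, not copied from the repo.
from typing import Optional

from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class LLMSettings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    NVIDIA_API_KEY: Optional[str] = Field(default=None)
    NVIDIA_MODEL: str = Field(default="meta/llama-3.1-8b-instruct")
    HF_TOKEN: Optional[str] = Field(default=None)
    LLM_TEMPERATURE: float = Field(default=0.7, ge=0.0, le=2.0)
    LLM_MAX_TOKENS: int = Field(default=1024, gt=0)


settings = LLMSettings()  # reads .env, falls back to the defaults above
print(settings.NVIDIA_MODEL)
```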
pyproject.toml CHANGED
@@ -16,10 +16,10 @@ dependencies = [
     "python-dotenv>=1.0.0",
     "python-multipart>=0.0.22",
     "streamlit>=1.53.1",
-    "transformers",
-    "torch",
     "uvicorn[standard]>=0.40.0",
     "websockets>=16.0",
+    "transformers>=4.32.0",
+    "torch>=2.1.1",
     "accelerate>=1.12.0",
 ]
 
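
Since transformers and torch were previously unpinned, environments resolved before this commit may sit below the new floors. A hedged one-off check (the packaging module is an assumption here, though it is usually present transitively):

```python
# One-off sanity check that the environment satisfies the new version floors.
from importlib.metadata import version

from packaging.version import Version  # assumed available transitively

for name, floor in [("transformers", "4.32.0"), ("torch", "2.1.1")]:
    installed = Version(version(name))
    assert installed >= Version(floor), f"{name} {installed} < {floor}"
```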
src/agent/llm_factory.py CHANGED
@@ -1,7 +1,5 @@
-from typing import Dict, Optional
-
-from langchain_core.language_models import BaseLanguageModel
-from langchain_huggingface import HuggingFaceEndpoint, HuggingFacePipeline
+from huggingface_hub import InferenceClient
+from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
 
 from src.core.logger import logger
@@ -9,83 +7,62 @@ from src.core.settings import settings
 
 
 class LLMFactory:
-    _instances: Dict[str, BaseLanguageModel] = {}
-
-    @classmethod
-    def create_llm(cls, provider: Optional[str] = None) -> BaseLanguageModel:
-        provider = (provider or settings.llm.LLM_PROVIDER).lower()
-
-        if provider in cls._instances:
-            return cls._instances[provider]
-
-        if provider == "nvidia":
-            llm = cls._create_nvidia_llm()
-        elif provider == "huggingface":
-            llm = cls._create_huggingface_llm()
-        else:
-            raise ValueError(f"Unknown LLM provider: {provider}")
-
-        cls._instances[provider] = llm
-        return llm
-
-    @classmethod
-    def reset_cache(cls, provider: Optional[str] = None) -> None:
-        if provider:
-            cls._instances.pop(provider.lower(), None)
-        else:
-            cls._instances.clear()
-
     @staticmethod
-    def _create_nvidia_llm() -> BaseLanguageModel:
-        logger.info(f"Initializing NVIDIA LLM: {settings.llm.NVIDIA_MODEL}")
+    def create_nvidia_llm(
+        model: str = settings.llm.NVIDIA_MODEL,
+        temperature: float = settings.llm.LLM_TEMPERATURE,
+        max_tokens: int = settings.llm.LLM_MAX_TOKENS,
+    ) -> ChatNVIDIA:
+        logger.info(f"Initializing NVIDIA LLM: {model}")
 
         if not settings.llm.NVIDIA_API_KEY:
             raise ValueError("NVIDIA_API_KEY must be set to use the NVIDIA LLM provider.")
 
         return ChatNVIDIA(
-            model=settings.llm.NVIDIA_MODEL,
+            model=model,
             api_key=settings.llm.NVIDIA_API_KEY,
-            temperature=settings.llm.LLM_TEMPERATURE,
-            max_completion_tokens=settings.llm.LLM_MAX_TOKENS,
+            temperature=temperature,
+            max_completion_tokens=max_tokens,
         )
 
     @staticmethod
-    def _create_huggingface_llm() -> BaseLanguageModel:
-        model_id = settings.llm.HF_MODEL
-        if not model_id:
-            raise ValueError("HF_MODEL must be set when using the HuggingFace LLM provider.")
-
-        if settings.llm.HF_USE_INFERENCE_API:
-            if not settings.llm.HF_TOKEN or not settings.llm.HF_TOKEN.strip():
-                raise ValueError(
-                    "HF_TOKEN must be provided when HF_USE_INFERENCE_API is true."
-                )
-
-            logger.info(f"Initializing Hugging Face Inference API LLM: {model_id}")
-            return HuggingFaceEndpoint(
-                repo_id=model_id,
-                huggingfacehub_api_token=settings.llm.HF_TOKEN,
-                temperature=settings.llm.LLM_TEMPERATURE,
-                max_new_tokens=settings.llm.LLM_MAX_TOKENS,
-            )
-
-        logger.info(f"Initializing local Hugging Face LLM: {model_id}")
-        logger.info("Downloading model if not already cached...")
-        return HuggingFacePipeline.from_model_id(
-            model_id=model_id,
-            task="text-generation",
-            trust_remote_code=settings.llm.HF_TRUST_REMOTE_CODE,
-            device_map="auto",
-            model_kwargs={
-                "temperature": settings.llm.LLM_TEMPERATURE,
-                "do_sample": True,
-            },
-            pipeline_kwargs={
-                "max_new_tokens": settings.llm.LLM_MAX_TOKENS,
-                "temperature": settings.llm.LLM_TEMPERATURE,
-                "do_sample": True,
-                "tokenizer_kwargs": {
-                    "use_fast": settings.llm.HF_USE_FAST_TOKENIZER,
-                },
-            },
-        )
+    def create_huggingface_llm(
+        model_id: str,
+        provider: str = "auto",
+        temperature: float = settings.llm.LLM_TEMPERATURE,
+        max_tokens: int = settings.llm.LLM_MAX_TOKENS,
+    ) -> ChatHuggingFace:
+        token = (settings.llm.HF_TOKEN or "").strip()
+        if not token:
+            raise ValueError("HF_TOKEN must be set to use the HuggingFace LLM provider.")
+
+        logger.info(f"Initializing HuggingFace LLM: {model_id} via provider={provider}")
+
+        llm = HuggingFaceEndpoint(
+            repo_id=model_id,
+            provider=provider,
+            huggingfacehub_api_token=token,
+            temperature=temperature,
+            max_new_tokens=max_tokens,
+        )
+        return ChatHuggingFace(llm=llm)
+
+    @staticmethod
+    def create_huggingface_stt(model_id: str | None = None) -> InferenceClient:
+        token = (settings.llm.HF_TOKEN or "").strip()
+        if not token:
+            raise ValueError("HF_TOKEN must be set to use the HuggingFace STT provider.")
+
+        logger.info(f"Initializing HuggingFace STT: {model_id or 'default'}")
+
+        return InferenceClient(model=model_id, token=token)
+
+    @staticmethod
+    def create_huggingface_tts(model_id: str | None = None) -> InferenceClient:
+        token = (settings.llm.HF_TOKEN or "").strip()
+        if not token:
+            raise ValueError("HF_TOKEN must be set to use the HuggingFace TTS provider.")
+
+        logger.info(f"Initializing HuggingFace TTS: {model_id or 'default'}")
+
+        return InferenceClient(model=model_id, token=token)
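
The refactor replaces the provider-string dispatch and singleton cache with direct static constructors. A usage sketch under assumptions: the model ids are illustrative (not taken from the repo), NVIDIA_API_KEY and HF_TOKEN must be set, and the STT/TTS task methods shown are huggingface_hub's standard InferenceClient helpers.

```python
# Usage sketch for the refactored factory; model ids are illustrative.
from src.agent.llm_factory import LLMFactory

# NVIDIA chat model, all arguments defaulting to values from settings.
chat = LLMFactory.create_nvidia_llm()
print(chat.invoke("Hello!").content)

# HuggingFace chat model routed through an inference provider.
hf_chat = LLMFactory.create_huggingface_llm("meta-llama/Llama-3.1-8B-Instruct")
print(hf_chat.invoke("Hello!").content)

# Speech clients are plain InferenceClient instances, so the standard
# huggingface_hub task helpers apply.
stt = LLMFactory.create_huggingface_stt("openai/whisper-large-v3")
transcript = stt.automatic_speech_recognition("sample.wav")

tts = LLMFactory.create_huggingface_tts()
audio_bytes = tts.text_to_speech("Testing the new factory")
```

One behavioral change worth noting: with the old _instances cache gone, every call builds a fresh client, so callers that relied on create_llm returning a shared instance now own their instances' lifetimes.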
src/core/settings.py CHANGED
@@ -2,7 +2,7 @@ import json
 from pathlib import Path
 from typing import Any, Optional
 
-from pydantic import Field, ValidationError, field_validator
+from pydantic import Field, ValidationError
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from dotenv import load_dotenv
 
@@ -61,28 +61,14 @@ class VoiceSettings(CoreSettings):
 
 
 class LLMSettings(CoreSettings):
-    LLM_PROVIDER: str = Field(default="nvidia")
-
     NVIDIA_API_KEY: Optional[str] = Field(default=None)
     NVIDIA_MODEL: str = Field(default="meta/llama-3.1-8b-instruct")
     NVIDIA_BASE_URL: str = Field(default="https://integrate.api.nvidia.com/v1")
 
     HF_TOKEN: Optional[str] = Field(default=None)
-    HF_MODEL: Optional[str] = Field(default="TheBloke/Llama-2-7B-Chat-GGUF")
-    HF_USE_INFERENCE_API: bool = Field(default=False)
-    HF_TRUST_REMOTE_CODE: bool = Field(default=False)
-    HF_USE_FAST_TOKENIZER: bool = Field(default=False)
 
     LLM_TEMPERATURE: float = Field(default=0.7, ge=0.0, le=2.0)
     LLM_MAX_TOKENS: int = Field(default=1024, gt=0)
-    LLM_STREAMING: bool = Field(default=True)
-
-    @field_validator("LLM_PROVIDER")
-    @classmethod
-    def validate_provider(cls, v: str) -> str:
-        if v.lower() not in ["nvidia", "huggingface"]:
-            raise ValueError("LLM_PROVIDER must be 'nvidia' or 'huggingface'")
-        return v.lower()
 
 
 class APISettings(CoreSettings):
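
With the LLM_PROVIDER validator gone, the remaining guardrails are the Field constraints. A quick sketch of how they behave (assuming LLMSettings is importable as shown in the diff):

```python
# Sketch: the kept Field bounds still reject out-of-range values.
from pydantic import ValidationError

from src.core.settings import LLMSettings

try:
    LLMSettings(LLM_TEMPERATURE=3.5)  # violates the le=2.0 bound
except ValidationError as exc:
    print(exc.errors()[0]["msg"])  # e.g. "Input should be less than or equal to 2"
```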
src/models/voice/factory.py DELETED
@@ -1,41 +0,0 @@
-from typing import Dict, Type, Callable
-
-from src.core.logger import logger
-from src.core.settings import settings
-from src.models.voice.base import BaseVoiceProvider, VoiceProviderConfig
-from src.models.voice.nvidia import NvidiaVoiceProvider, NvidiaConfig
-
-
-class VoiceProviderFactory:
-    _registry: Dict[str, Callable[[VoiceProviderConfig], BaseVoiceProvider]] = {}
-
-    @classmethod
-    def register(cls, name: str, provider_class: Type[BaseVoiceProvider]) -> None:
-        cls._registry[name.lower()] = provider_class
-        logger.debug(f"Registered voice provider: {name}")
-
-    @classmethod
-    def create_provider(cls, provider_name: str = None) -> BaseVoiceProvider:
-        provider_name = provider_name or settings.voice.VOICE_PROVIDER
-        provider_name = provider_name.lower()
-
-        if provider_name not in cls._registry:
-            raise ValueError(f"Unknown voice provider: {provider_name}. Available: {list(cls._registry.keys())}")
-
-        logger.info(f"Creating voice provider: {provider_name}")
-
-        if provider_name == "nvidia":
-            config = NvidiaConfig(
-                api_key=settings.llm.NVIDIA_API_KEY,
-                language=settings.voice.NVIDIA_VOICE_LANGUAGE,
-                voice_name=settings.voice.NVIDIA_VOICE_NAME,
-                tts_model=settings.voice.NVIDIA_TTS_MODEL,
-                tts_endpoint=settings.voice.NVIDIA_TTS_ENDPOINT,
-                sample_rate_output=settings.voice.SAMPLE_RATE_OUTPUT,
-            )
-            return cls._registry[provider_name](config)
-
-        raise NotImplementedError(f"Configuration for {provider_name} not yet implemented")
-
-
-VoiceProviderFactory.register("nvidia", NvidiaVoiceProvider)
src/models/voice/nvidia.py DELETED
@@ -1,91 +0,0 @@
-import asyncio
-from typing import AsyncIterator, Optional
-
-import httpx
-
-from src.core.logger import logger
-from src.models.voice.base import BaseVoiceProvider, VoiceProviderConfig
-from src.models.voice.types import TranscriptionResult, VADInfo
-
-
-class NvidiaConfig(VoiceProviderConfig):
-    provider_name: str = "nvidia"
-    api_key: str
-    language: str = "en-US"
-    voice_name: str = "Magpie-Multilingual.EN-US.Aria"
-    tts_model: str = "magpie-tts-multilingual"
-    tts_endpoint: str = ""
-    sample_rate_output: int = 48000
-
-
-class NvidiaVoiceProvider(BaseVoiceProvider):
-    def __init__(self, config: NvidiaConfig):
-        super().__init__(config)
-        self.config: NvidiaConfig = config
-        self._current_vad: Optional[VADInfo] = None
-
-    async def connect(self) -> None:
-        # No connection needed for HTTP API
-        self._connected = True
-        logger.info("NVIDIA API TTS provider ready")
-
-    async def disconnect(self) -> None:
-        self._connected = False
-        logger.info("NVIDIA API TTS provider disconnected")
-
-    async def text_to_speech(
-        self, text: str, stream: bool = True
-    ) -> AsyncIterator[bytes]:
-        if not self.is_connected:
-            raise RuntimeError("NVIDIA API provider not connected")
-
-        if not self.config.tts_endpoint:
-            raise RuntimeError(
-                "TTS requires NVIDIA_TTS_ENDPOINT to be set. "
-                "Get a TTS endpoint from: https://build.nvidia.com/"
-            )
-
-        async for chunk in self._text_to_speech_http(text, stream):
-            yield chunk
-
-    async def _text_to_speech_http(
-        self, text: str, stream: bool = True
-    ) -> AsyncIterator[bytes]:
-        endpoint = self.config.tts_endpoint.rstrip("/")
-        url = f"{endpoint}/v1/audio/synthesize"
-
-        try:
-            logger.debug(f"Generating speech via HTTP API for text: {text[:50]}...")
-
-            headers = {
-                "Authorization": f"Bearer {self.config.api_key}",
-                "Content-Type": "application/json"
-            }
-
-            payload = {
-                "language": self.config.language,
-                "text": text,
-                "voice": self.config.voice_name,
-                "sample_rate_hz": self.config.sample_rate_output,
-            }
-
-            async with httpx.AsyncClient(timeout=60.0) as client:
-                response = await client.post(url, json=payload, headers=headers)
-                response.raise_for_status()
-
-                # For streaming, we need to handle the response appropriately
-                # For now, return the full content
-                yield response.content
-
-            logger.debug("HTTP TTS generation complete")
-
-        except httpx.HTTPStatusError as e:
-            logger.error(f"HTTP error in NVIDIA TTS API: {e.response.status_code} - {e.response.text}")
-            raise RuntimeError(f"NVIDIA TTS API error: {e.response.status_code}") from e
-        except Exception as e:
-            logger.error(f"Error in NVIDIA HTTP TTS API: {e}")
-            raise
-
-    async def get_vad_info(self) -> Optional[VADInfo]:
-        return self._current_vad
uv.lock CHANGED
@@ -1099,8 +1099,8 @@ requires-dist = [
     { name = "python-dotenv", specifier = ">=1.0.0" },
     { name = "python-multipart", specifier = ">=0.0.22" },
     { name = "streamlit", specifier = ">=1.53.1" },
-    { name = "torch" },
-    { name = "transformers" },
+    { name = "torch", specifier = ">=2.1.1" },
+    { name = "transformers", specifier = ">=4.32.0" },
     { name = "uvicorn", extras = ["standard"], specifier = ">=0.40.0" },
     { name = "websockets", specifier = ">=16.0" },
 ]
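
To confirm the regenerated lockfile carries the same floors as pyproject.toml, the requires-dist entries shown above can be read back with tomllib (Python 3.11+). A sketch, assuming it runs from the repo root and the lockfile layout shown in the hunk:

```python
# Sketch: read the locked requires-dist entries back out of uv.lock.
import tomllib

with open("uv.lock", "rb") as fh:
    lock = tomllib.load(fh)

for pkg in lock.get("package", []):
    for dep in pkg.get("metadata", {}).get("requires-dist", []):
        if dep.get("name") in {"torch", "transformers"}:
            print(dep["name"], dep.get("specifier"))
```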