Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -57,11 +57,15 @@ is_initialized = False
|
|
| 57 |
class Config:
|
| 58 |
# OpenAI Compatible API Configuration
|
| 59 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 60 |
-
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
# Model Configuration
|
| 63 |
-
LLM_MODEL = os.getenv("LLM_MODEL", "gpt-3.5-turbo")
|
| 64 |
-
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")
|
| 65 |
|
| 66 |
# Document Processing
|
| 67 |
CHUNK_SIZE = 500
|
|
@@ -70,12 +74,12 @@ class Config:
|
|
| 70 |
# Rate Limiting
|
| 71 |
MAX_RETRIES = 5
|
| 72 |
RATE_LIMIT_DELAY = 2.0
|
| 73 |
-
EMBEDDING_BATCH_SIZE = 10
|
| 74 |
-
EMBEDDING_DELAY = 1.0
|
| 75 |
|
| 76 |
# Model Parameters
|
| 77 |
TEMPERATURE = 0.5
|
| 78 |
-
MAX_OUTPUT_TOKENS =
|
| 79 |
RETRIEVER_K = 10
|
| 80 |
|
| 81 |
# Paths
|
|
@@ -86,7 +90,7 @@ config = Config()
|
|
| 86 |
|
| 87 |
# Request/Response Models
|
| 88 |
class QueryRequest(BaseModel):
|
| 89 |
-
query: str = Field(..., min_length=1, max_length=
|
| 90 |
|
| 91 |
class QueryResponse(BaseModel):
|
| 92 |
answer: str
|
|
@@ -100,14 +104,18 @@ class SystemStatus(BaseModel):
|
|
| 100 |
is_initialized: bool
|
| 101 |
model_name: str
|
| 102 |
embedding_model: str
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
vector_store_ready: bool
|
| 105 |
total_chunks: int = 0
|
| 106 |
api_key_configured: bool
|
| 107 |
|
| 108 |
class InitializeRequest(BaseModel):
|
| 109 |
api_key: str = Field(..., min_length=1)
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
llm_model: Optional[str] = Field(default=None, description="LLM model name")
|
| 112 |
embedding_model: Optional[str] = Field(default=None, description="Embedding model name")
|
| 113 |
|
|
@@ -125,7 +133,7 @@ def estimate_tokens(text: str) -> int:
|
|
| 125 |
except:
|
| 126 |
return len(text.split()) * 1.3 # Rough estimate
|
| 127 |
|
| 128 |
-
# Rate limiting helper functions
|
| 129 |
async def rate_limited_embedding_creation(chunks, embeddings):
|
| 130 |
"""Create embeddings with rate limiting to avoid API limits."""
|
| 131 |
logger.info(f"Creating embeddings for {len(chunks)} chunks with rate limiting...")
|
|
@@ -188,7 +196,7 @@ async def rate_limited_embedding_creation(chunks, embeddings):
|
|
| 188 |
logger.info("Successfully created and merged all embeddings")
|
| 189 |
return final_vector_store
|
| 190 |
|
| 191 |
-
# Custom Callback Handler for OpenAI
|
| 192 |
class TokenUsageCallbackHandler(BaseCallbackHandler):
|
| 193 |
"""Callback handler to track token usage in OpenAI calls."""
|
| 194 |
|
|
@@ -239,7 +247,14 @@ class TokenUsageCallbackHandler(BaseCallbackHandler):
|
|
| 239 |
}
|
| 240 |
|
| 241 |
# RAG System Functions
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
"""Initialize or reinitialize the RAG system with OpenAI compatible API."""
|
| 244 |
global vector_store, qa_chain, token_callback_handler, is_initialized, config
|
| 245 |
|
|
@@ -247,13 +262,14 @@ async def initialize_rag_system(api_key: str = None, base_url: str = None, llm_m
|
|
| 247 |
# Update configuration
|
| 248 |
if api_key:
|
| 249 |
config.OPENAI_API_KEY = api_key
|
| 250 |
-
os.environ["OPENAI_API_KEY"] = api_key
|
| 251 |
elif not config.OPENAI_API_KEY:
|
| 252 |
raise ValueError("OpenAI API key not provided")
|
| 253 |
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
|
|
|
|
|
|
| 257 |
|
| 258 |
if llm_model:
|
| 259 |
config.LLM_MODEL = llm_model
|
|
@@ -261,8 +277,10 @@ async def initialize_rag_system(api_key: str = None, base_url: str = None, llm_m
|
|
| 261 |
if embedding_model:
|
| 262 |
config.EMBEDDING_MODEL = embedding_model
|
| 263 |
|
|
|
|
| 264 |
logger.info(f"Initializing RAG system with:")
|
| 265 |
-
logger.info(f" - Base URL: {config.OPENAI_BASE_URL}")
|
|
|
|
| 266 |
logger.info(f" - LLM Model: {config.LLM_MODEL}")
|
| 267 |
logger.info(f" - Embedding Model: {config.EMBEDDING_MODEL}")
|
| 268 |
|
|
@@ -284,16 +302,15 @@ async def initialize_rag_system(api_key: str = None, base_url: str = None, llm_m
|
|
| 284 |
chunks = text_splitter.split_documents(documents)
|
| 285 |
logger.info(f"Document split into {len(chunks)} chunks")
|
| 286 |
|
| 287 |
-
# Check if we have too many chunks that might cause rate limiting
|
| 288 |
if len(chunks) > 200:
|
| 289 |
logger.warning(f"Large number of chunks ({len(chunks)}). Consider increasing chunk_size to reduce API calls.")
|
| 290 |
|
| 291 |
-
# Initialize OpenAI embeddings
|
| 292 |
embeddings = OpenAIEmbeddings(
|
| 293 |
model=config.EMBEDDING_MODEL,
|
| 294 |
openai_api_key=config.OPENAI_API_KEY,
|
| 295 |
-
openai_api_base=config.OPENAI_BASE_URL,
|
| 296 |
-
chunk_size=1000
|
| 297 |
)
|
| 298 |
|
| 299 |
# Test embedding connection
|
|
@@ -326,11 +343,11 @@ async def initialize_rag_system(api_key: str = None, base_url: str = None, llm_m
|
|
| 326 |
vector_store.save_local(config.INDEX_PATH)
|
| 327 |
logger.info(f"Created new FAISS index at '{config.INDEX_PATH}'")
|
| 328 |
|
| 329 |
-
# Initialize OpenAI LLM
|
| 330 |
llm = ChatOpenAI(
|
| 331 |
model_name=config.LLM_MODEL,
|
| 332 |
openai_api_key=config.OPENAI_API_KEY,
|
| 333 |
-
openai_api_base=config.OPENAI_BASE_URL,
|
| 334 |
temperature=config.TEMPERATURE,
|
| 335 |
max_tokens=config.MAX_OUTPUT_TOKENS,
|
| 336 |
callbacks=[token_callback_handler],
|
|
@@ -339,6 +356,9 @@ async def initialize_rag_system(api_key: str = None, base_url: str = None, llm_m
|
|
| 339 |
|
| 340 |
# Test LLM connection
|
| 341 |
try:
|
|
|
|
|
|
|
|
|
|
| 342 |
test_response = llm.invoke("Test connection")
|
| 343 |
logger.info("Successfully connected to LLM API")
|
| 344 |
except Exception as e:
|
|
@@ -390,6 +410,7 @@ async def startup_event():
|
|
| 390 |
"""Initialize the system on startup if API key is available."""
|
| 391 |
if config.OPENAI_API_KEY:
|
| 392 |
try:
|
|
|
|
| 393 |
await initialize_rag_system()
|
| 394 |
except Exception as e:
|
| 395 |
logger.warning(f"Could not initialize on startup: {str(e)}")
|
|
@@ -424,7 +445,9 @@ async def get_status():
|
|
| 424 |
is_initialized=is_initialized,
|
| 425 |
model_name=config.LLM_MODEL,
|
| 426 |
embedding_model=config.EMBEDDING_MODEL,
|
| 427 |
-
|
|
|
|
|
|
|
| 428 |
vector_store_ready=vector_store is not None,
|
| 429 |
total_chunks=len(vector_store.docstore._dict) if vector_store else 0,
|
| 430 |
api_key_configured=bool(config.OPENAI_API_KEY)
|
|
@@ -434,17 +457,21 @@ async def get_status():
|
|
| 434 |
async def initialize_system(request: InitializeRequest):
|
| 435 |
"""Initialize the RAG system with provided API key and configuration."""
|
| 436 |
try:
|
|
|
|
| 437 |
await initialize_rag_system(
|
| 438 |
api_key=request.api_key,
|
| 439 |
-
|
|
|
|
| 440 |
llm_model=request.llm_model,
|
| 441 |
embedding_model=request.embedding_model
|
| 442 |
)
|
|
|
|
| 443 |
return {
|
| 444 |
"success": True,
|
| 445 |
"message": "System initialized successfully",
|
| 446 |
"config": {
|
| 447 |
-
"base_url": config.OPENAI_BASE_URL,
|
|
|
|
| 448 |
"llm_model": config.LLM_MODEL,
|
| 449 |
"embedding_model": config.EMBEDDING_MODEL
|
| 450 |
}
|
|
@@ -512,6 +539,8 @@ async def process_query(request: QueryRequest):
|
|
| 512 |
logger.error(f"Error processing query: {str(e)}")
|
| 513 |
raise HTTPException(status_code=500, detail=str(e))
|
| 514 |
|
|
|
|
|
|
|
| 515 |
@app.get("/api/token-stats", response_model=Dict[str, Any])
|
| 516 |
async def get_token_stats():
|
| 517 |
"""Get token usage statistics."""
|
|
@@ -570,8 +599,10 @@ async def health_check():
|
|
| 570 |
@app.get("/api/config")
|
| 571 |
async def get_config():
|
| 572 |
"""Get current configuration."""
|
|
|
|
| 573 |
return {
|
| 574 |
-
"base_url": config.OPENAI_BASE_URL,
|
|
|
|
| 575 |
"llm_model": config.LLM_MODEL,
|
| 576 |
"embedding_model": config.EMBEDDING_MODEL,
|
| 577 |
"chunk_size": config.CHUNK_SIZE,
|
|
|
|
| 57 |
class Config:
|
| 58 |
# OpenAI Compatible API Configuration
|
| 59 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 60 |
+
# REMOVED: OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
|
| 61 |
+
|
| 62 |
+
# ADDED: Separate base URLs for LLM and Embeddings
|
| 63 |
+
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "https://api.openai.com/v1")
|
| 64 |
+
EMBEDDING_BASE_URL = os.getenv("EMBEDDING_BASE_URL", "https://api.openai.com/v1")
|
| 65 |
|
| 66 |
# Model Configuration
|
| 67 |
+
LLM_MODEL = os.getenv("LLM_MODEL", "gpt-3.5-turbo")
|
| 68 |
+
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-ada-002")
|
| 69 |
|
| 70 |
# Document Processing
|
| 71 |
CHUNK_SIZE = 500
|
|
|
|
| 74 |
# Rate Limiting
|
| 75 |
MAX_RETRIES = 5
|
| 76 |
RATE_LIMIT_DELAY = 2.0
|
| 77 |
+
EMBEDDING_BATCH_SIZE = 10
|
| 78 |
+
EMBEDDING_DELAY = 1.0
|
| 79 |
|
| 80 |
# Model Parameters
|
| 81 |
TEMPERATURE = 0.5
|
| 82 |
+
MAX_OUTPUT_TOKENS = 2000
|
| 83 |
RETRIEVER_K = 10
|
| 84 |
|
| 85 |
# Paths
|
|
|
|
| 90 |
|
| 91 |
# Request/Response Models
|
| 92 |
class QueryRequest(BaseModel):
|
| 93 |
+
query: str = Field(..., min_length=1, max_length=10000)
|
| 94 |
|
| 95 |
class QueryResponse(BaseModel):
|
| 96 |
answer: str
|
|
|
|
| 104 |
is_initialized: bool
|
| 105 |
model_name: str
|
| 106 |
embedding_model: str
|
| 107 |
+
# CHANGED: Use separate URLs
|
| 108 |
+
llm_base_url: str
|
| 109 |
+
embedding_base_url: str
|
| 110 |
vector_store_ready: bool
|
| 111 |
total_chunks: int = 0
|
| 112 |
api_key_configured: bool
|
| 113 |
|
| 114 |
class InitializeRequest(BaseModel):
|
| 115 |
api_key: str = Field(..., min_length=1)
|
| 116 |
+
# CHANGED: Accept separate URLs
|
| 117 |
+
llm_base_url: Optional[str] = Field(default=None, description="LLM (text generation) API base URL")
|
| 118 |
+
embedding_base_url: Optional[str] = Field(default=None, description="Embedding model API base URL")
|
| 119 |
llm_model: Optional[str] = Field(default=None, description="LLM model name")
|
| 120 |
embedding_model: Optional[str] = Field(default=None, description="Embedding model name")
|
| 121 |
|
|
|
|
| 133 |
except:
|
| 134 |
return len(text.split()) * 1.3 # Rough estimate
|
| 135 |
|
| 136 |
+
# Rate limiting helper functions (No changes needed here)
|
| 137 |
async def rate_limited_embedding_creation(chunks, embeddings):
|
| 138 |
"""Create embeddings with rate limiting to avoid API limits."""
|
| 139 |
logger.info(f"Creating embeddings for {len(chunks)} chunks with rate limiting...")
|
|
|
|
| 196 |
logger.info("Successfully created and merged all embeddings")
|
| 197 |
return final_vector_store
|
| 198 |
|
| 199 |
+
# Custom Callback Handler for OpenAI (No changes needed here)
|
| 200 |
class TokenUsageCallbackHandler(BaseCallbackHandler):
|
| 201 |
"""Callback handler to track token usage in OpenAI calls."""
|
| 202 |
|
|
|
|
| 247 |
}
|
| 248 |
|
| 249 |
# RAG System Functions
|
| 250 |
+
# CHANGED: Function signature to accept separate URLs
|
| 251 |
+
async def initialize_rag_system(
|
| 252 |
+
api_key: str = None,
|
| 253 |
+
llm_base_url: str = None,
|
| 254 |
+
embedding_base_url: str = None,
|
| 255 |
+
llm_model: str = None,
|
| 256 |
+
embedding_model: str = None
|
| 257 |
+
):
|
| 258 |
"""Initialize or reinitialize the RAG system with OpenAI compatible API."""
|
| 259 |
global vector_store, qa_chain, token_callback_handler, is_initialized, config
|
| 260 |
|
|
|
|
| 262 |
# Update configuration
|
| 263 |
if api_key:
|
| 264 |
config.OPENAI_API_KEY = api_key
|
|
|
|
| 265 |
elif not config.OPENAI_API_KEY:
|
| 266 |
raise ValueError("OpenAI API key not provided")
|
| 267 |
|
| 268 |
+
# CHANGED: Update separate base URLs
|
| 269 |
+
if llm_base_url:
|
| 270 |
+
config.LLM_BASE_URL = llm_base_url
|
| 271 |
+
if embedding_base_url:
|
| 272 |
+
config.EMBEDDING_BASE_URL = embedding_base_url
|
| 273 |
|
| 274 |
if llm_model:
|
| 275 |
config.LLM_MODEL = llm_model
|
|
|
|
| 277 |
if embedding_model:
|
| 278 |
config.EMBEDDING_MODEL = embedding_model
|
| 279 |
|
| 280 |
+
# CHANGED: Update logging
|
| 281 |
logger.info(f"Initializing RAG system with:")
|
| 282 |
+
logger.info(f" - LLM Base URL: {config.LLM_BASE_URL}")
|
| 283 |
+
logger.info(f" - Embedding Base URL: {config.EMBEDDING_BASE_URL}")
|
| 284 |
logger.info(f" - LLM Model: {config.LLM_MODEL}")
|
| 285 |
logger.info(f" - Embedding Model: {config.EMBEDDING_MODEL}")
|
| 286 |
|
|
|
|
| 302 |
chunks = text_splitter.split_documents(documents)
|
| 303 |
logger.info(f"Document split into {len(chunks)} chunks")
|
| 304 |
|
|
|
|
| 305 |
if len(chunks) > 200:
|
| 306 |
logger.warning(f"Large number of chunks ({len(chunks)}). Consider increasing chunk_size to reduce API calls.")
|
| 307 |
|
| 308 |
+
# CHANGED: Initialize OpenAI embeddings with its specific base URL
|
| 309 |
embeddings = OpenAIEmbeddings(
|
| 310 |
model=config.EMBEDDING_MODEL,
|
| 311 |
openai_api_key=config.OPENAI_API_KEY,
|
| 312 |
+
openai_api_base=config.EMBEDDING_BASE_URL,
|
| 313 |
+
chunk_size=1000
|
| 314 |
)
|
| 315 |
|
| 316 |
# Test embedding connection
|
|
|
|
| 343 |
vector_store.save_local(config.INDEX_PATH)
|
| 344 |
logger.info(f"Created new FAISS index at '{config.INDEX_PATH}'")
|
| 345 |
|
| 346 |
+
# CHANGED: Initialize OpenAI LLM with its specific base URL
|
| 347 |
llm = ChatOpenAI(
|
| 348 |
model_name=config.LLM_MODEL,
|
| 349 |
openai_api_key=config.OPENAI_API_KEY,
|
| 350 |
+
openai_api_base=config.LLM_BASE_URL,
|
| 351 |
temperature=config.TEMPERATURE,
|
| 352 |
max_tokens=config.MAX_OUTPUT_TOKENS,
|
| 353 |
callbacks=[token_callback_handler],
|
|
|
|
| 356 |
|
| 357 |
# Test LLM connection
|
| 358 |
try:
|
| 359 |
+
# Note: The os.environ is not strictly needed if passing params directly,
|
| 360 |
+
# but setting it can be a good practice for other potential library uses.
|
| 361 |
+
# We'll rely on direct parameter passing which is cleaner.
|
| 362 |
test_response = llm.invoke("Test connection")
|
| 363 |
logger.info("Successfully connected to LLM API")
|
| 364 |
except Exception as e:
|
|
|
|
| 410 |
"""Initialize the system on startup if API key is available."""
|
| 411 |
if config.OPENAI_API_KEY:
|
| 412 |
try:
|
| 413 |
+
# This will use the URLs from environment variables by default
|
| 414 |
await initialize_rag_system()
|
| 415 |
except Exception as e:
|
| 416 |
logger.warning(f"Could not initialize on startup: {str(e)}")
|
|
|
|
| 445 |
is_initialized=is_initialized,
|
| 446 |
model_name=config.LLM_MODEL,
|
| 447 |
embedding_model=config.EMBEDDING_MODEL,
|
| 448 |
+
# CHANGED: Return separate URLs
|
| 449 |
+
llm_base_url=config.LLM_BASE_URL,
|
| 450 |
+
embedding_base_url=config.EMBEDDING_BASE_URL,
|
| 451 |
vector_store_ready=vector_store is not None,
|
| 452 |
total_chunks=len(vector_store.docstore._dict) if vector_store else 0,
|
| 453 |
api_key_configured=bool(config.OPENAI_API_KEY)
|
|
|
|
| 457 |
async def initialize_system(request: InitializeRequest):
|
| 458 |
"""Initialize the RAG system with provided API key and configuration."""
|
| 459 |
try:
|
| 460 |
+
# CHANGED: Pass separate URLs to the initialization function
|
| 461 |
await initialize_rag_system(
|
| 462 |
api_key=request.api_key,
|
| 463 |
+
llm_base_url=request.llm_base_url,
|
| 464 |
+
embedding_base_url=request.embedding_base_url,
|
| 465 |
llm_model=request.llm_model,
|
| 466 |
embedding_model=request.embedding_model
|
| 467 |
)
|
| 468 |
+
# CHANGED: Return separate URLs in the response
|
| 469 |
return {
|
| 470 |
"success": True,
|
| 471 |
"message": "System initialized successfully",
|
| 472 |
"config": {
|
| 473 |
+
"llm_base_url": config.LLM_BASE_URL,
|
| 474 |
+
"embedding_base_url": config.EMBEDDING_BASE_URL,
|
| 475 |
"llm_model": config.LLM_MODEL,
|
| 476 |
"embedding_model": config.EMBEDDING_MODEL
|
| 477 |
}
|
|
|
|
| 539 |
logger.error(f"Error processing query: {str(e)}")
|
| 540 |
raise HTTPException(status_code=500, detail=str(e))
|
| 541 |
|
| 542 |
+
# (No changes needed in the remaining endpoints)
|
| 543 |
+
|
| 544 |
@app.get("/api/token-stats", response_model=Dict[str, Any])
|
| 545 |
async def get_token_stats():
|
| 546 |
"""Get token usage statistics."""
|
|
|
|
| 599 |
@app.get("/api/config")
|
| 600 |
async def get_config():
|
| 601 |
"""Get current configuration."""
|
| 602 |
+
# CHANGED: Return separate URLs
|
| 603 |
return {
|
| 604 |
+
"llm_base_url": config.LLM_BASE_URL,
|
| 605 |
+
"embedding_base_url": config.EMBEDDING_BASE_URL,
|
| 606 |
"llm_model": config.LLM_MODEL,
|
| 607 |
"embedding_model": config.EMBEDDING_MODEL,
|
| 608 |
"chunk_size": config.CHUNK_SIZE,
|