Spaces:

dinukpathiraja
/

boqapi

Sleeping

App Files Files Community

Dinuk-Di committed on Mar 24

Commit

a122f91

1 Parent(s): 735e421

Chat Api

Browse files

Files changed (8) hide show

.gitattributes +2 -0
Dockerfile +10 -20
app/main.py +26 -63
app/model.py +38 -120
app/routes.py +17 -124
app/schema.py +9 -107
app/services.py +0 -247
requirements.txt +7 -13

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.env filter=lfs diff=lfs merge=lfs -text
+*.env.* filter=lfs diff=lfs merge=lfs -text

Dockerfile CHANGED Viewed

@@ -3,23 +3,18 @@ FROM python:3.11-slim
 ENV DEBIAN_FRONTEND=noninteractive \
     PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
-    HF_HOME=/app/.cache/huggingface \
-    TRANSFORMERS_CACHE=/app/.cache/huggingface \
     PORT=7860
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    ffmpeg libsndfile1 git curl \
-    && apt-get clean && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
-# Upgrade build tools first
-RUN pip install --no-cache-dir --upgrade pip setuptools wheel packaging hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
-# Default PyTorch with CUDA support (crucial for A10G inference to avoid CPU OOM)
-RUN pip install --no-cache-dir torch torchvision torchaudio
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
@@ -30,13 +25,8 @@ USER appuser
 EXPOSE 7860
-HEALTHCHECK --interval=60s --timeout=15s --start-period=300s --retries=3 \
-    CMD curl -f http://localhost:7860/ || exit 1
-CMD ["python", "-m", "uvicorn", "main:app", \
-     "--app-dir", "app", \
-     "--host", "0.0.0.0", \
-     "--port", "7860", \
-     "--workers", "1", \
-     "--loop", "uvloop", \
-     "--log-level", "info"]

 ENV DEBIAN_FRONTEND=noninteractive \
     PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PORT=7860
 WORKDIR /app
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+# Upgrade build tools
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel
+# Install dependencies (no heavy CUDA packages needed for LangChain HF Endpoint!)
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 EXPOSE 7860
+HEALTHCHECK --interval=60s --timeout=15s --start-period=30s --retries=3 \
+    CMD curl -f http://localhost:7860/api/health || exit 1
+# Start uvicorn, ensuring sys.path includes the root so `app.` imports work
+CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

app/main.py CHANGED Viewed

@@ -1,60 +1,41 @@
-# main.py
-import logging
-import time
-import os
-from contextlib import asynccontextmanager
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
-from slowapi import Limiter, _rate_limit_exceeded_handler
-from slowapi.util import get_remote_address
-from slowapi.errors import RateLimitExceeded
-import uvicorn
-from model import load_model
-from routes import router_rag, router_ingest, router_monitor, router_health
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
-)
 logger = logging.getLogger(__name__)
-limiter = Limiter(key_func=get_remote_address)
-ENABLE_AUDIO = os.getenv("ENABLE_AUDIO_OUTPUT", "false").lower() == "true"
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     print("\n" + "="*60, flush=True)
-    print("🚀 INITIALIZING APP: Downloading and loading Qwen2.5-Omni-3B", flush=True)
-    print("⏳ This model is ~15GB and will take a few minutes to load.", flush=True)
     print("="*60 + "\n", flush=True)
-    logger.info("Starting up — loading Qwen2.5-Omni-3B model...")
-    load_model(enable_audio_output=ENABLE_AUDIO)
-    logger.info("Model ready. API is live.")
     print("\n✅ API is LIVE on port 7860! Ready for requests.\n", flush=True)
     yield
-    logger.info("Shutting down API.")
 app = FastAPI(
-    title="Multimodal RAG API",
-    description=(
-        "Production-ready RAG API powered by Qwen2.5-Omni-3B. "
-        "Supports text, image, audio, and video modalities."
-    ),
     version="1.0.0",
     lifespan=lifespan,
-    docs_url="/docs",
-    redoc_url="/redoc",
 )
-# ── Middleware ────────────────────────────────────────────────────────────────
 app.add_middleware(
     CORSMiddleware,
     allow_origins=os.getenv("CORS_ORIGINS", "*").split(","),
@@ -63,44 +44,26 @@ app.add_middleware(
     allow_headers=["*"],
 )
-app.state.limiter = limiter
-app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
 @app.middleware("http")
 async def request_logging_middleware(request: Request, call_next):
     start = time.time()
     response = await call_next(request)
     duration_ms = (time.time() - start) * 1000
-    logger.info(
-        f"{request.method} {request.url.path} "
-        f"→ {response.status_code} [{duration_ms:.1f}ms]"
-    )
     return response
-@app.exception_handler(Exception)
-async def global_exception_handler(request: Request, exc: Exception):
-    logger.exception(f"Unhandled exception: {exc}")
-    return JSONResponse(
-        status_code=500,
-        content={"error": "Internal server error", "detail": str(exc)},
-    )
-# ── Routers ───────────────────────────────────────────────────────────────────
-app.include_router(router_health)
-app.include_router(router_rag)
-app.include_router(router_ingest)
-app.include_router(router_monitor)
 if __name__ == "__main__":
     uvicorn.run(
-        "main:app",
         host="0.0.0.0",
         port=7860,
-        workers=1,          # Single worker for GPU models
-        loop="uvloop",
         log_level="info",
     )

 from fastapi import FastAPI, Request
+from contextlib import asynccontextmanager
 from fastapi.middleware.cors import CORSMiddleware
+from app.routes import router as api_router
+from app.model import load_model
+from dotenv import load_dotenv
+import os
+import time
+import logging
+load_dotenv()
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+REPO_ID = os.getenv("MODEL_REPO_ID", "deepseek-ai/DeepSeek-R1")
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     print("\n" + "="*60, flush=True)
+    print(f"🚀 INITIALIZING CHAT API: Setup remote LLM ({REPO_ID})", flush=True)
     print("="*60 + "\n", flush=True)
+    # Store the LLM in the application state so routes can access it
+    app.state.llm = load_model(repo_id=REPO_ID)
     print("\n✅ API is LIVE on port 7860! Ready for requests.\n", flush=True)
     yield
+    print("\n" + "="*60, flush=True)
+    print("👋 Shutting down API. Goodbye!", flush=True)
 app = FastAPI(
+    title="Multimodal Chat API",
+    description="Production-ready Chat API powered by LangChain and HuggingFaceEndpoint.",
     version="1.0.0",
     lifespan=lifespan,
 )
 app.add_middleware(
     CORSMiddleware,
     allow_origins=os.getenv("CORS_ORIGINS", "*").split(","),
     allow_headers=["*"],
 )
 @app.middleware("http")
 async def request_logging_middleware(request: Request, call_next):
     start = time.time()
     response = await call_next(request)
     duration_ms = (time.time() - start) * 1000
+    if request.url.path != "/api/health":
+        logger.info(
+            f"{request.method} {request.url.path} "
+            f"→ {response.status_code} [{duration_ms:.1f}ms]"
+        )
     return response
+app.include_router(api_router, prefix="/api")
 if __name__ == "__main__":
+    import uvicorn
     uvicorn.run(
+        "app.main:app",
         host="0.0.0.0",
         port=7860,
+        workers=1,
         log_level="info",
     )

app/model.py CHANGED Viewed

@@ -1,121 +1,39 @@
-# model.py
-import torch
-import logging
-import time
-from typing import Optional, Tuple, List, Dict, Any
-from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
-from qwen_omni_utils import process_mm_info
-logger = logging.getLogger(__name__)
-MODEL_ID = "Qwen/Qwen2.5-Omni-3B"
-_model: Optional[Qwen2_5OmniForConditionalGeneration] = None
-_processor: Optional[Qwen2_5OmniProcessor] = None
-_model_load_time: float = 0.0
-def load_model(enable_audio_output: bool = False):
-    global _model, _processor, _model_load_time
-    if _model is not None and _processor is not None:
-        return _model, _processor
-    logger.info(f"Loading model: {MODEL_ID}")
-    start = time.time()
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    logger.info(f"Using device: {device}")
-    load_kwargs: Dict[str, Any] = {
-        # Use float32 on CPU — bfloat16 is poorly supported on CPU
-        "torch_dtype": torch.bfloat16 if device == "cuda" else torch.float32,
-        "device_map": "auto" if device == "cuda" else "cpu",
-        # NO flash_attention_2 — only works with GPU + nvcc
-    }
-    _model = Qwen2_5OmniForConditionalGeneration.from_pretrained(MODEL_ID, **load_kwargs)
-    # Always disable talker on CPU — saves ~2GB and talker requires GPU
-    _model.disable_talker()
-    logger.info("Audio talker disabled (CPU mode — saves memory).")
-    _processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
-    _model_load_time = time.time() - start
-    logger.info(f"Model loaded in {_model_load_time:.2f}s on {device}")
-    return _model, _processor
-def get_model() -> Qwen2_5OmniForConditionalGeneration:
-    if _model is None:
-        raise RuntimeError("Model not loaded. Call load_model() first.")
-    return _model
-def get_processor() -> Qwen2_5OmniProcessor:
-    if _processor is None:
-        raise RuntimeError("Processor not loaded. Call load_model() first.")
-    return _processor
-def run_inference(
-    conversation: List[Dict],
-    return_audio: bool = False,
-    speaker: str = "Chelsie",
-    max_new_tokens: int = 256,
-    temperature: float = 0.7,
-    use_audio_in_video: bool = True,
-) -> Tuple[str, Optional[bytes], int, int]:
-    model = get_model()
-    processor = get_processor()
-    # Force return_audio=False on CPU since talker is disabled
-    if not torch.cuda.is_available():
-        return_audio = False
-    text_template = processor.apply_chat_template(
-        conversation,
-        add_generation_prompt=True,
-        tokenize=False,
     )
-    audios, images, videos = process_mm_info(
-        conversation, use_audio_in_video=use_audio_in_video
-    )
-    inputs = processor(
-        text=text_template,
-        audio=audios,
-        images=images,
-        videos=videos,
-        return_tensors="pt",
-        padding=True,
-        use_audio_in_video=use_audio_in_video,
-    ).to(model.device)
-    # Match dtype for CPU (float32)
-    if not torch.cuda.is_available():
-        inputs = {k: v.float() if v.dtype == torch.float16 else v
-                  for k, v in inputs.items()}
-    prompt_tokens = inputs["input_ids"].shape[-1]
-    generate_kwargs: Dict[str, Any] = {
-        "use_audio_in_video": use_audio_in_video,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "do_sample": temperature > 0,
-        "return_audio": False,  # Always False — talker disabled on CPU
-    }
-    with torch.inference_mode():
-        outputs = model.generate(**inputs, **generate_kwargs)
-    completion_tokens = outputs.shape[-1] - prompt_tokens
-    decoded = processor.batch_decode(
-        outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    answer = decoded[0] if decoded else ""
-    return answer, None, prompt_tokens, completion_tokens

+from langchain_core.prompts import PromptTemplate
+from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
+import os
+from app.schema import OutputResponse
+# We completely remove `getpass` to prevent blocking the Docker container.
+# HuggingFace secrets should be defined in the HF space environment automatically.
+if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
+    print("WARNING: HUGGINGFACEHUB_API_TOKEN is not set in the environment. "
+          "Set this as a secret in your HuggingFace Space or .env file.")
+def load_model(repo_id: str, max_length: int = 512, temperature: float = 0.5):
+    llm = HuggingFaceEndpoint(
+        repo_id=repo_id,
+        task="text-generation",
+        max_new_tokens=max_length,
+        do_sample=temperature > 0,
+        temperature=temperature if temperature > 0 else None,
     )
+    return llm
+def generate_answer(question: str, llm) -> OutputResponse:
+    try:
+        prompt = PromptTemplate(
+            input_variables=["question"],
+            template="""
+You are a helpful assistant that provides concise and accurate answers to user questions.
+Question: {question}
+Answer Format:
+Answer: <Your concise answer here>
+Justification: <Why this query is relevant to the user's request>
+"""
+        )
+        chat_model = ChatHuggingFace(llm=llm)
+        structured_llm = chat_model.with_structured_output(OutputResponse)
+        result = structured_llm.invoke(prompt.format(question=question))
+        return result
+    except Exception as e:
+        return OutputResponse(answer="Error generating answer", justification=str(e))

app/routes.py CHANGED Viewed

@@ -1,131 +1,24 @@
-# routes.py
-import logging
-import time
-from typing import Optional
-from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks, Header
-from fastapi.responses import JSONResponse
-from schema import (
-    RAGQueryRequest, RAGQueryResponse,
-    IngestRequest, IngestResponse,
-    HealthResponse, ErrorResponse, UserStatsResponse,
-)
-from services import (
-    process_rag_query, ingest_documents,
-    get_global_stats, get_user_stats,
-)
-from model import get_model, get_processor
-logger = logging.getLogger(__name__)
-router_rag = APIRouter(prefix="/rag", tags=["RAG"])
-router_ingest = APIRouter(prefix="/ingest", tags=["Ingestion"])
-router_monitor = APIRouter(prefix="/monitor", tags=["Monitoring"])
-router_health = APIRouter(tags=["Health"])
-_start_time = time.time()
-# ── Auth dependency (replace with JWT/OAuth in production) ───────────────────
-async def verify_api_key(x_api_key: Optional[str] = Header(default=None)):
-    import os
-    expected = os.getenv("API_KEY", "dev-secret")
-    if x_api_key != expected:
-        raise HTTPException(status_code=401, detail="Invalid or missing API key.")
-    return x_api_key
-# ── RAG Routes ────────────────────────────────────────────────────────────────
-@router_rag.post(
-    "/query",
-    response_model=RAGQueryResponse,
-    responses={400: {"model": ErrorResponse}, 500: {"model": ErrorResponse}},
-    summary="Multimodal RAG Query",
-    description="Submit text, image, audio, or video inputs to query the RAG pipeline.",
-)
-async def rag_query(
-    request: RAGQueryRequest,
-    background_tasks: BackgroundTasks,
-    _: str = Depends(verify_api_key),
-):
     try:
-        response = await process_rag_query(request)
-        return response
-    except ValueError as e:
-        raise HTTPException(status_code=400, detail=str(e))
     except Exception as e:
-        logger.exception(f"RAG query failed: {e}")
-        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
-# ── Ingestion Routes ──────────────────────────────────────────────────────────
-@router_ingest.post(
-    "/documents",
-    response_model=IngestResponse,
-    summary="Ingest Multimodal Documents",
-)
-async def ingest_docs(
-    request: IngestRequest,
-    _: str = Depends(verify_api_key),
-):
     try:
-        doc_ids = ingest_documents(
-            user_id=request.user_id,
-            documents=request.documents,
-            doc_ids=request.doc_ids,
-        )
-        return IngestResponse(
-            ingested_count=len(doc_ids),
-            doc_ids=doc_ids,
-            message=f"Successfully ingested {len(doc_ids)} documents.",
-        )
     except Exception as e:
-        logger.exception(f"Ingestion failed: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-# ── Monitoring Routes ─────────────────────────────────────────────────────────
-@router_monitor.get(
-    "/health",
-    response_model=HealthResponse,
-    summary="API Health Check",
-)
-async def health_check():
-    try:
-        model = get_model()
-        device = str(next(model.parameters()).device)
-        model_loaded = True
-    except RuntimeError:
-        device = "unavailable"
-        model_loaded = False
-    stats = get_global_stats()
-    return HealthResponse(
-        status="ok" if model_loaded else "degraded",
-        model_loaded=model_loaded,
-        device=device,
-        uptime_seconds=time.time() - _start_time,
-        total_requests=stats["total_requests"],
-        total_tokens_processed=stats["total_tokens"],
-    )
-@router_monitor.get(
-    "/users/{user_id}",
-    response_model=UserStatsResponse,
-    summary="Get Per-User Usage Stats",
-)
-async def user_stats(
-    user_id: str,
-    _: str = Depends(verify_api_key),
-):
-    stats = get_user_stats(user_id)
-    if stats is None:
-        raise HTTPException(status_code=404, detail=f"User '{user_id}' not found.")
-    return UserStatsResponse(**stats)
-@router_health.get("/", include_in_schema=False)
-async def root():
-    return {"message": "Multimodal RAG API is running. Visit /docs for API reference."}

+from fastapi import APIRouter, Request
+from app.model import generate_answer
+from app.schema import UserRequest
+router = APIRouter()
+@router.get("/health", tags=["Health"])
+async def health_check():
     try:
+        return {"status": "ok"}
     except Exception as e:
+        return {"status": "error", "message": str(e)}
+@router.post("/chat", tags=["Chat"])
+async def chat_endpoint(request_body: UserRequest, request: Request):
     try:
+        llm = request.app.state.llm
+        if not llm:
+            return {"status": "error", "message": "LLM not loaded into application state."}
+        response = generate_answer(request_body.question, llm)
+        return response
     except Exception as e:
+        return {"status": "error", "message": str(e)}

app/schema.py CHANGED Viewed

@@ -1,107 +1,9 @@
-# schema.py
-from pydantic import BaseModel, Field, validator
-from typing import Optional, List, Literal, Any, Dict
-from enum import Enum
-import uuid
-from datetime import datetime
-class ModalityType(str, Enum):
-    TEXT = "text"
-    IMAGE = "image"
-    AUDIO = "audio"
-    VIDEO = "video"
-class MediaInput(BaseModel):
-    modality: ModalityType
-    content: str = Field(..., description="URL, base64 string, or raw text depending on modality")
-    use_audio_in_video: Optional[bool] = Field(default=True, description="Use embedded audio in video")
-class RAGQueryRequest(BaseModel):
-    query_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
-    user_id: str = Field(..., description="Unique user identifier")
-    query_text: Optional[str] = Field(default=None, description="Natural language query")
-    media_inputs: Optional[List[MediaInput]] = Field(default=[], description="List of multimodal inputs")
-    top_k: int = Field(default=5, ge=1, le=20, description="Number of RAG context chunks to retrieve")
-    return_audio: bool = Field(
-    default=False,
-    description="Audio output (GPU only — disabled on CPU deployments)")
-    speaker: Literal["Chelsie", "Ethan"] = Field(default="Chelsie")
-    max_new_tokens: int = Field(default=256, ge=32, le=512)
-    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
-    @validator("media_inputs", always=True)
-    def validate_at_least_one_input(cls, v, values):
-        if not v and not values.get("query_text"):
-            raise ValueError("At least one of query_text or media_inputs must be provided.")
-        return v
-class RAGDocument(BaseModel):
-    doc_id: str
-    content: str
-    modality: ModalityType
-    score: float = Field(..., ge=0.0, le=1.0)
-    metadata: Optional[Dict[str, Any]] = {}
-class TokenUsage(BaseModel):
-    prompt_tokens: int
-    completion_tokens: int
-    total_tokens: int
-class PerformanceMetrics(BaseModel):
-    latency_ms: float
-    retrieval_latency_ms: float
-    generation_latency_ms: float
-    throughput_tokens_per_sec: float
-class RAGQueryResponse(BaseModel):
-    query_id: str
-    user_id: str
-    answer_text: str
-    retrieved_documents: List[RAGDocument]
-    audio_base64: Optional[str] = None
-    token_usage: TokenUsage
-    performance: PerformanceMetrics
-    timestamp: datetime = Field(default_factory=datetime.utcnow)
-class IngestRequest(BaseModel):
-    user_id: str
-    documents: List[MediaInput]
-    doc_ids: Optional[List[str]] = None
-class IngestResponse(BaseModel):
-    ingested_count: int
-    doc_ids: List[str]
-    message: str
-class HealthResponse(BaseModel):
-    status: str
-    model_loaded: bool
-    device: str
-    uptime_seconds: float
-    total_requests: int
-    total_tokens_processed: int
-class ErrorResponse(BaseModel):
-    error: str
-    detail: Optional[str] = None
-    query_id: Optional[str] = None
-    timestamp: datetime = Field(default_factory=datetime.utcnow)
-class UserStatsResponse(BaseModel):
-    user_id: str
-    total_queries: int
-    total_tokens: int
-    avg_latency_ms: float
-    last_active: Optional[datetime] = None

+from pydantic import BaseModel, Field
+class UserRequest(BaseModel):
+    question: str = Field(..., description="The user's question or request.")
+class OutputResponse(BaseModel):
+    answer: str = Field(..., description="The answer generated by the model.")
+    justification: str = Field(
+        ..., description="Why this query is relevant to the user's request."
+    )

app/services.py DELETED Viewed

@@ -1,247 +0,0 @@
-# services.py
-import uuid
-import time
-import base64
-import logging
-import asyncio
-from datetime import datetime
-from collections import defaultdict
-from typing import List, Dict, Optional, Tuple, Any
-import numpy as np
-from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
-from schema import (
-    MediaInput, RAGDocument, TokenUsage, PerformanceMetrics,
-    RAGQueryRequest, RAGQueryResponse, ModalityType
-)
-from model import run_inference
-logger = logging.getLogger(__name__)
-# ── In-memory vector store (swap for FAISS/Qdrant/Chroma in production) ──────
-_doc_store: Dict[str, Dict[str, Any]] = {}
-_embeddings_store: Dict[str, np.ndarray] = {}
-_embed_model: Optional[SentenceTransformer] = None
-# ── Monitoring state ──────────────────────────────────────────────────────────
-_global_stats = {
-    "total_requests": 0,
-    "total_tokens": 0,
-    "start_time": time.time(),
-}
-_user_stats: Dict[str, Dict] = defaultdict(lambda: {
-    "total_queries": 0,
-    "total_tokens": 0,
-    "latencies": [],
-    "last_active": None,
-})
-def get_embed_model() -> SentenceTransformer:
-    global _embed_model
-    if _embed_model is None:
-        _embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-        logger.info("Embedding model loaded.")
-    return _embed_model
-def _embed_text(text: str) -> np.ndarray:
-    model = get_embed_model()
-    return model.encode([text], normalize_embeddings=True)[0]
-def ingest_documents(
-    user_id: str,
-    documents: List[MediaInput],
-    doc_ids: Optional[List[str]] = None,
-) -> List[str]:
-    ids = []
-    for i, doc in enumerate(documents):
-        doc_id = doc_ids[i] if doc_ids and i < len(doc_ids) else str(uuid.uuid4())
-        # For non-text modalities, embed a descriptor; full multimodal embeddings
-        # require a separate vision-language embedding model (e.g., CLIP, ImageBind).
-        if doc.modality == ModalityType.TEXT:
-            embed_text = doc.content
-        else:
-            embed_text = f"[{doc.modality.value.upper()} resource] {doc.content}"
-        embedding = _embed_text(embed_text)
-        _doc_store[doc_id] = {
-            "doc_id": doc_id,
-            "user_id": user_id,
-            "content": doc.content,
-            "modality": doc.modality,
-            "metadata": {"ingested_at": datetime.utcnow().isoformat()},
-        }
-        _embeddings_store[doc_id] = embedding
-        ids.append(doc_id)
-    logger.info(f"Ingested {len(ids)} documents for user {user_id}")
-    return ids
-def retrieve_documents(
-    query_text: str,
-    top_k: int = 5,
-) -> List[RAGDocument]:
-    if not _embeddings_store:
-        return []
-    query_emb = _embed_text(query_text).reshape(1, -1)
-    doc_ids = list(_embeddings_store.keys())
-    doc_embs = np.vstack([_embeddings_store[d] for d in doc_ids])
-    scores = cosine_similarity(query_emb, doc_embs)[0]
-    top_indices = np.argsort(scores)[::-1][:top_k]
-    results = []
-    for idx in top_indices:
-        did = doc_ids[idx]
-        doc = _doc_store[did]
-        results.append(RAGDocument(
-            doc_id=did,
-            content=doc["content"],
-            modality=doc["modality"],
-            score=float(scores[idx]),
-            metadata=doc.get("metadata", {}),
-        ))
-    return results
-def _build_rag_conversation(
-    request: RAGQueryRequest,
-    retrieved_docs: List[RAGDocument],
-) -> List[Dict]:
-    system_prompt = (
-        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
-        "capable of perceiving auditory and visual inputs, as well as generating text and speech."
-        if request.return_audio
-        else "You are a helpful multimodal AI assistant with access to retrieved context."
-    )
-    context_str = "\n\n".join(
-        [f"[Context {i+1} | {d.modality.value}]: {d.content}" for i, d in enumerate(retrieved_docs)]
-    )
-    user_content: List[Dict] = []
-    for media in (request.media_inputs or []):
-        if media.modality == ModalityType.TEXT:
-            user_content.append({"type": "text", "text": media.content})
-        elif media.modality == ModalityType.IMAGE:
-            user_content.append({"type": "image", "image": media.content})
-        elif media.modality == ModalityType.AUDIO:
-            user_content.append({"type": "audio", "audio": media.content})
-        elif media.modality == ModalityType.VIDEO:
-            user_content.append({"type": "video", "video": media.content})
-    final_query = (
-        f"Retrieved Context:\n{context_str}\n\n"
-        f"User Query: {request.query_text or 'Analyze the provided media.'}"
-    )
-    user_content.append({"type": "text", "text": final_query})
-    return [
-        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
-        {"role": "user", "content": user_content},
-    ]
-def _update_monitoring(
-    user_id: str,
-    total_tokens: int,
-    latency_ms: float,
-):
-    _global_stats["total_requests"] += 1
-    _global_stats["total_tokens"] += total_tokens
-    user = _user_stats[user_id]
-    user["total_queries"] += 1
-    user["total_tokens"] += total_tokens
-    user["latencies"].append(latency_ms)
-    # Keep only last 1000 latencies per user to avoid unbounded memory
-    if len(user["latencies"]) > 1000:
-        user["latencies"] = user["latencies"][-1000:]
-    user["last_active"] = datetime.utcnow()
-async def process_rag_query(request: RAGQueryRequest) -> RAGQueryResponse:
-    total_start = time.time()
-    # ── Retrieval ─────────────────────────────────────────────────────────────
-    retrieval_start = time.time()
-    query_for_retrieval = request.query_text or " ".join(
-        m.content for m in (request.media_inputs or []) if m.modality == ModalityType.TEXT
-    ) or "multimodal query"
-    retrieved_docs = retrieve_documents(query_for_retrieval, top_k=request.top_k)
-    retrieval_latency_ms = (time.time() - retrieval_start) * 1000
-    # ── Build conversation ────────────────────────────────────────────────────
-    conversation = _build_rag_conversation(request, retrieved_docs)
-    use_audio_in_video = any(
-        m.use_audio_in_video for m in (request.media_inputs or [])
-        if m.modality == ModalityType.VIDEO
-    )
-    # ── Generation (run in thread pool to avoid blocking event loop) ──────────
-    gen_start = time.time()
-    loop = asyncio.get_event_loop()
-    answer, audio_bytes, prompt_tokens, completion_tokens = await loop.run_in_executor(
-        None,
-        lambda: run_inference(
-            conversation=conversation,
-            return_audio=request.return_audio,
-            speaker=request.speaker,
-            max_new_tokens=request.max_new_tokens,
-            temperature=request.temperature,
-            use_audio_in_video=use_audio_in_video,
-        ),
-    )
-    gen_latency_ms = (time.time() - gen_start) * 1000
-    total_latency_ms = (time.time() - total_start) * 1000
-    total_tokens = prompt_tokens + completion_tokens
-    throughput = (completion_tokens / (gen_latency_ms / 1000)) if gen_latency_ms > 0 else 0
-    _update_monitoring(request.user_id, total_tokens, total_latency_ms)
-    audio_b64 = base64.b64encode(audio_bytes).decode("utf-8") if audio_bytes else None
-    return RAGQueryResponse(
-        query_id=request.query_id,
-        user_id=request.user_id,
-        answer_text=answer,
-        retrieved_documents=retrieved_docs,
-        audio_base64=audio_b64,
-        token_usage=TokenUsage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=total_tokens,
-        ),
-        performance=PerformanceMetrics(
-            latency_ms=total_latency_ms,
-            retrieval_latency_ms=retrieval_latency_ms,
-            generation_latency_ms=gen_latency_ms,
-            throughput_tokens_per_sec=throughput,
-        ),
-    )
-def get_global_stats() -> Dict:
-    return _global_stats.copy()
-def get_user_stats(user_id: str) -> Optional[Dict]:
-    if user_id not in _user_stats:
-        return None
-    u = _user_stats[user_id]
-    return {
-        "user_id": user_id,
-        "total_queries": u["total_queries"],
-        "total_tokens": u["total_tokens"],
-        "avg_latency_ms": float(np.mean(u["latencies"])) if u["latencies"] else 0.0,
-        "last_active": u["last_active"],
-    }

requirements.txt CHANGED Viewed

@@ -1,13 +1,7 @@
-fastapi>=0.111.0
-uvicorn[standard]>=0.29.0
-uvloop>=0.19.0
-pydantic>=2.7.0
-transformers @ git+https://github.com/huggingface/transformers@v4.51.3-Qwen2.5-Omni-preview
-accelerate>=0.30.0
-qwen-omni-utils[decord]
-sentence-transformers>=3.0.0
-scikit-learn>=1.4.0
-soundfile>=0.12.1
-numpy>=1.26.0
-slowapi>=0.1.9
-# flash-attn REMOVED — requires nvcc/GPU to compile

+fastapi[standard]
+langchain_core
+langgraph
+huggingface_hub
+langchain
+langchain-huggingface
+pydantic