Pujan-Dev committed on
Commit
128b0a8
·
verified ·
1 Parent(s): 12e4d25

Upload 8 files

Browse files
Files changed (6) hide show
  1. .gitignore +1 -0
  2. Dockerfile +39 -0
  3. config.py +93 -0
  4. main.py +37 -0
  5. rag_service.py +248 -0
  6. schemas.py +26 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Keep Python output unbuffered and avoid .pyc files in containers.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# Skip pip's download cache to keep image layers small.
ENV PIP_NO_CACHE_DIR=1

# Optional Hugging Face cache location inside the container.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favour of HF_HOME; both are set here for compatibility — confirm
# against the installed transformers version.
ENV HF_HOME=/app/.cache/huggingface
ENV TRANSFORMERS_CACHE=/app/.cache/huggingface

WORKDIR /app

# System libs often needed by ML wheels/runtime.
# git for VCS-pinned installs, build-essential for source builds; the apt
# list cleanup in the same RUN keeps this layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies used by Fastapi/main.py.
RUN pip install --upgrade pip && pip install \
    fastapi \
    "uvicorn[standard]" \
    numpy \
    faiss-cpu \
    torch \
    transformers \
    sentencepiece \
    InstructorEmbedding \
    langchain-core

# Copy the whole repo so Fastapi app can resolve vector_db.index/chunks.pkl
# from /app, /app/Fastapi, or /app/RAG_pipeline.
COPY . /app

EXPOSE 8000

# Run FastAPI app.
CMD ["uvicorn", "Fastapi.main:app", "--host", "0.0.0.0", "--port", "8000"]
config.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from pathlib import Path
3
+ import os
4
+
5
+
6
+ def _load_dotenv(dotenv_path: Path) -> None:
7
+ if not dotenv_path.exists():
8
+ return
9
+
10
+ for raw_line in dotenv_path.read_text(encoding="utf-8").splitlines():
11
+ line = raw_line.strip()
12
+ if not line or line.startswith("#") or "=" not in line:
13
+ continue
14
+
15
+ key, value = line.split("=", 1)
16
+ key = key.strip()
17
+ value = value.strip().strip('"').strip("'")
18
+ os.environ.setdefault(key, value)
19
+
20
+
21
+ def _get_env(name: str, default: str, aliases: tuple[str, ...] = ()) -> str:
22
+ for key in (name, *aliases):
23
+ value = os.getenv(key)
24
+ if value is not None and value != "":
25
+ return value
26
+ return default
27
+
28
+
29
+ def _to_int(value: str, default: int) -> int:
30
+ try:
31
+ return int(value)
32
+ except (TypeError, ValueError):
33
+ return default
34
+
35
+
36
+ def _to_float(value: str, default: float) -> float:
37
+ try:
38
+ return float(value)
39
+ except (TypeError, ValueError):
40
+ return default
41
+
42
+
43
# Resolve the directory containing this module and seed os.environ from an
# optional .env file beside it (real environment variables take precedence).
_BASE_DIR = Path(__file__).resolve().parent
_load_dotenv(_BASE_DIR / ".env")
45
+
46
+
47
@dataclass
class Settings:
    """Runtime configuration resolved from environment variables (and .env)."""

    # Title shown in the FastAPI docs UI.
    app_title: str = _get_env("APP_TITLE", "RAG API")
    # Causal-LM checkpoint id; MODEL_NAME is accepted as a legacy alias.
    model_id: str = _get_env("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct", aliases=("MODEL_NAME",))
    # INSTRUCTOR embedding checkpoint used for retrieval.
    embedding_model_id: str = _get_env(
        "EMBEDDING_MODEL_ID",
        "hkunlp/instructor-base",
        aliases=("EMBEDDING_MODEL",),
    )

    # Data file names, resolved against data_search_roots below.
    models_dir: str = _get_env("MODELS_DIR", "Models")
    vector_db_file: str = _get_env("VECTOR_DB_FILE", "vector_db.index", aliases=("VECTOR_STORE_PATH",))
    chunks_file: str = _get_env("CHUNKS_FILE", "chunks.pkl")

    # Instruction prepended to each query for the INSTRUCTOR embedding model.
    retrieval_instruction: str = _get_env(
        "RETRIEVAL_INSTRUCTION",
        "Represent the question for retrieving relevant documents",
    )

    # Prompt-length budget and generation/sampling parameters.
    max_context_tokens: int = _to_int(_get_env("MAX_CONTEXT_TOKENS", "3072"), 3072)
    max_new_tokens: int = _to_int(_get_env("MAX_NEW_TOKENS", "500"), 500)
    temperature: float = _to_float(_get_env("TEMPERATURE", "0.3"), 0.3)
    repetition_penalty: float = _to_float(_get_env("REPETITION_PENALTY", "1.3"), 1.3)

    # Default and allowed bounds for the per-request top-k parameter.
    default_top_k: int = _to_int(_get_env("DEFAULT_TOP_K", "3"), 3)
    min_top_k: int = _to_int(_get_env("MIN_TOP_K", "1"), 1)
    max_top_k: int = _to_int(_get_env("MAX_TOP_K", "10"), 10)

    # Bind address for the API server.
    host: str = _get_env("HOST", "0.0.0.0", aliases=("API_HOST",))
    port: int = _to_int(_get_env("PORT", "8000", aliases=("API_PORT",)), 8000)

    # Directory containing this config module (default_factory so the
    # mutable Path is not shared as a plain class default).
    base_dir: Path = field(default_factory=lambda: _BASE_DIR)

    @property
    def data_search_roots(self) -> list[Path]:
        """Candidate directories, in priority order, for locating data files."""
        models_path = Path(self.models_dir)
        return [
            self.base_dir / models_path,
            self.base_dir,
            self.base_dir.parent / models_path,
            self.base_dir.parent / "RAG_pipeline" / models_path,
            self.base_dir.parent / "RAG_pipeline",
            self.base_dir.parent,
        ]
91
+
92
+
93
# Module-level singleton imported by the rest of the application.
settings = Settings()
main.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from contextlib import asynccontextmanager
3
+
4
+ from config import settings
5
+ from rag_service import preload, rag_query, state
6
+ from schemas import QueryRequest, QueryResponse
7
+
8
+
9
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Load models and data once at startup; nothing to tear down on shutdown."""
    preload()
    yield
13
+
14
+
15
# FastAPI application; `lifespan` preloads models before requests are served.
app = FastAPI(title=settings.app_title, lifespan=lifespan)
16
+
17
+
18
+ @app.get("/")
19
+ def root():
20
+ model_runtime_device = None
21
+ if state.model is not None:
22
+ model_runtime_device = str(next(state.model.parameters()).device)
23
+ return {
24
+ "message": "RAG API is running",
25
+ "device": state.device,
26
+ "model_runtime_device": model_runtime_device,
27
+ "model_dtype": str(state.model_dtype),
28
+ "startup_timing": state.startup_timing,
29
+ }
30
+
31
+
32
+ @app.post("/query", response_model=QueryResponse)
33
+ def query(payload: QueryRequest):
34
+ if state.index is None or state.embedding_model is None or state.model is None:
35
+ raise HTTPException(status_code=503, detail="Model is not loaded yet")
36
+ result = rag_query(payload.question, k=payload.k)
37
+ return QueryResponse(**result)
rag_service.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import pickle
3
+ import time
4
+
5
+ import faiss
6
+ import numpy as np
7
+ import torch
8
+ from InstructorEmbedding import INSTRUCTOR
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
+
11
+ from config import settings
12
+
13
+
14
+ class _CompatDocument:
15
+ """Fallback placeholder for pickled langchain Document objects."""
16
+
17
+ pass
18
+
19
+
20
+ class _CompatUnpickler(pickle.Unpickler):
21
+ """Map langchain document class to a lightweight local placeholder."""
22
+
23
+ def find_class(self, module, name):
24
+ if module == "langchain_core.documents.base" and name == "Document":
25
+ return _CompatDocument
26
+ return super().find_class(module, name)
27
+
28
+
29
+ def _load_chunks(path: Path):
30
+ """Load chunks.pkl with normal pickle, then fallback if langchain_core is absent."""
31
+ with path.open("rb") as f:
32
+ try:
33
+ return pickle.load(f)
34
+ except ModuleNotFoundError as e:
35
+ if e.name != "langchain_core":
36
+ raise
37
+
38
+ with path.open("rb") as f:
39
+ return _CompatUnpickler(f).load()
40
+
41
+
42
+ def _chunk_payload(chunk):
43
+ """Return the serialized payload for both real and fallback document objects."""
44
+ if hasattr(chunk, "page_content") and hasattr(chunk, "metadata"):
45
+ return {
46
+ "page_content": chunk.page_content,
47
+ "metadata": chunk.metadata,
48
+ }
49
+
50
+ raw = getattr(chunk, "__dict__", {})
51
+ nested = raw.get("__dict__", raw)
52
+ if isinstance(nested, dict):
53
+ return nested
54
+ return {}
55
+
56
+
57
def _chunk_page_content(chunk):
    """Return the chunk's text body, or "" when absent."""
    return _chunk_payload(chunk).get("page_content", "")


def _chunk_metadata(chunk):
    """Return the chunk's metadata dict, or {} when absent."""
    return _chunk_payload(chunk).get("metadata", {})
63
+
64
+
65
def find_data_file(filename: str) -> Path:
    """Locate a data file by absolute path or by scanning the search roots.

    Args:
        filename: An absolute path, or a bare name resolved against
            ``settings.data_search_roots`` in priority order.

    Returns:
        Path of the first existing candidate.

    Raises:
        FileNotFoundError: If no candidate exists.
    """
    explicit = Path(filename)
    if explicit.is_absolute() and explicit.exists():
        return explicit

    for root in settings.data_search_roots:
        candidate = root / filename
        if candidate.exists():
            return candidate
    # Bug fix: the original message was an f-string with no placeholder and
    # said "(unknown)"; include the actual filename so the error is actionable.
    raise FileNotFoundError(f"Could not find {filename!r} in expected locations")
75
+
76
+
77
class AppState:
    """Mutable container for everything loaded at startup."""

    def __init__(self):
        # Prefer the first CUDA device when available; otherwise run on CPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model_id = settings.model_id
        # fp16 on GPU (possibly upgraded to bf16 in preload()), fp32 on CPU.
        self.model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        # All of these are populated by preload(); None means "not loaded yet".
        self.index = None            # FAISS vector index
        self.chunks = None           # deserialized document chunks
        self.embedding_model = None  # INSTRUCTOR embedding model
        self.model = None            # causal LM
        self.tokenizer = None        # tokenizer matching self.model
        self.startup_timing = {}     # seconds spent in each preload stage
89
+
90
+
91
# Shared module-level state used by the FastAPI handlers.
state = AppState()
92
+
93
+
94
def retrieve_chunks(query: str, k: int) -> list:
    """Return the k chunks whose embeddings are nearest to the query."""
    # INSTRUCTOR expects [instruction, text] pairs for each input.
    embedding = state.embedding_model.encode([[settings.retrieval_instruction, query]])[0]
    vector = np.array([embedding]).astype("float32")
    _scores, hit_ids = state.index.search(vector, k)
    return [state.chunks[chunk_id] for chunk_id in hit_ids[0]]
99
+
100
+
101
def generate_answer(question: str, retrieved_chunks: list) -> str:
    """Generate an answer to `question` grounded in `retrieved_chunks`.

    Builds a chat prompt containing the numbered source texts, samples from
    the preloaded LLM, and returns only the newly generated text.
    """
    # Number each source so the model can synthesize across all of them.
    context = ""
    for i, chunk in enumerate(retrieved_chunks):
        context += f"Source {i + 1}:\n{_chunk_page_content(chunk)}\n\n"

    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful assistant that answers questions using ONLY the provided sources. "
                "Synthesize information from ALL sources given. "
                "Give a complete and coherent answer. "
                "Do not cut off mid sentence. "
                "If the sources do not contain enough information say so clearly."
            ),
        },
        {
            "role": "user",
            "content": (
                f"Question: {question}\n\n"
                f"{context}"
                "Based on ALL the sources above provide a complete answer to the question."
            ),
        },
    ]

    # Render the messages with the model's own chat template.
    text = state.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Truncate the prompt to the configured context budget, then move the
    # tensors to the model's device.
    inputs = state.tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=settings.max_context_tokens,
    ).to(state.device)

    with torch.no_grad():
        output = state.model.generate(
            **inputs,
            max_new_tokens=settings.max_new_tokens,
            temperature=settings.temperature,
            do_sample=True,
            # Many causal LMs ship without a pad token; fall back to EOS.
            pad_token_id=state.tokenizer.eos_token_id,
            repetition_penalty=settings.repetition_penalty,
        )

    # Drop the prompt tokens; decode only what the model generated.
    generated_tokens = output[0][inputs["input_ids"].shape[1] :]
    return state.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
152
+
153
+
154
def rag_query(question: str, k: int) -> dict:
    """Run retrieve-then-generate for `question` and report stage timings."""
    overall_start = time.perf_counter()

    stage_start = time.perf_counter()
    retrieved = retrieve_chunks(question, k=k)
    retrieval_seconds = time.perf_counter() - stage_start

    stage_start = time.perf_counter()
    answer = generate_answer(question, retrieved)
    generation_seconds = time.perf_counter() - stage_start
    total_seconds = time.perf_counter() - overall_start

    # One source URL per retrieved chunk ("" when the chunk carries none).
    return {
        "question": question,
        "answer": answer,
        "sources": [_chunk_metadata(chunk).get("url", "") for chunk in retrieved],
        "timing": {
            "retrieval_seconds": retrieval_seconds,
            "generation_seconds": generation_seconds,
            "total_seconds": total_seconds,
        },
    }
177
+
178
+
179
def preload() -> dict:
    """Load the index, chunks, embedding model, and LLM into `state`.

    Returns:
        The per-stage startup timing dict (also stored on
        ``state.startup_timing``).
    """
    t0 = time.perf_counter()

    print(f"Using device : {state.device}")
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        print(f"CUDA available : True ({gpu_name})")
        # Prefer bf16 where supported: wider dynamic range than fp16.
        if torch.cuda.is_bf16_supported():
            state.model_dtype = torch.bfloat16
        else:
            state.model_dtype = torch.float16
        print(f"Model dtype : {state.model_dtype}")
    else:
        print("CUDA available : False")
        state.model_dtype = torch.float32

    # Stage 1: FAISS index.
    print("Loading vector DB...")
    t_index = time.perf_counter()
    index_path = find_data_file(settings.vector_db_file)
    state.index = faiss.read_index(str(index_path))
    index_time = time.perf_counter() - t_index
    print(f"Index loaded : {state.index.ntotal} vectors")

    # Stage 2: pickled document chunks (must align with the index rows).
    print("Loading chunks...")
    t_chunks = time.perf_counter()
    chunks_path = find_data_file(settings.chunks_file)
    state.chunks = _load_chunks(chunks_path)
    chunks_time = time.perf_counter() - t_chunks
    print(f"Chunks loaded : {len(state.chunks)}")

    # Stage 3: embedding model for query-time retrieval.
    print("Loading embedding model...")
    t_embed = time.perf_counter()
    state.embedding_model = INSTRUCTOR(settings.embedding_model_id)
    if torch.cuda.is_available():
        try:
            state.embedding_model.to(state.device)
        except Exception:
            # Some InstructorEmbedding backends do not expose .to(); keep CPU fallback.
            pass
    embedding_time = time.perf_counter() - t_embed

    # Stage 4: the causal LM and its tokenizer.
    print(f"Loading {settings.model_id}...")
    t_model = time.perf_counter()
    state.model = AutoModelForCausalLM.from_pretrained(
        settings.model_id,
        torch_dtype=state.model_dtype,
        device_map={"": state.device},
    )
    state.tokenizer = AutoTokenizer.from_pretrained(settings.model_id)
    state.model.eval()
    model_time = time.perf_counter() - t_model

    first_param_device = str(next(state.model.parameters()).device)
    print(f"LLM loaded on : {first_param_device}")

    total_startup = time.perf_counter() - t0
    state.startup_timing = {
        "index_load_seconds": index_time,
        "chunks_load_seconds": chunks_time,
        "embedding_model_load_seconds": embedding_time,
        "llm_load_seconds": model_time,
        "total_startup_seconds": total_startup,
    }

    print("RAG API preloaded successfully")
    print(
        f"Startup timing: total={total_startup:.2f}s, index={index_time:.2f}s, "
        f"chunks={chunks_time:.2f}s, embedding={embedding_time:.2f}s, model={model_time:.2f}s"
    )
    return state.startup_timing
schemas.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+
3
+ from config import settings
4
+
5
+
6
class QueryRequest(BaseModel):
    """Request body for POST /query."""

    question: str = Field(..., min_length=1, description="User question")
    # Bounds and default come from configuration so deployments can tune
    # retrieval depth without code changes.
    k: int = Field(
        default=settings.default_top_k,
        ge=settings.min_top_k,
        le=settings.max_top_k,
        description="Top-k chunks to retrieve",
    )
14
+
15
+
16
class TimingPayload(BaseModel):
    """Per-stage timing (in seconds) for a single query."""

    retrieval_seconds: float
    generation_seconds: float
    total_seconds: float
20
+
21
+
22
class QueryResponse(BaseModel):
    """Response body for POST /query."""

    question: str
    answer: str
    sources: list[str]  # one URL (possibly "") per retrieved chunk
    timing: TimingPayload