Spaces:
Running
Running
RAG test1
Browse files- api/main.py +49 -0
- hybrid/assistant.py +15 -34
- models/llm.py +18 -26
api/main.py
CHANGED
|
@@ -85,6 +85,17 @@ class HybridQueryRequest(BaseModel):
|
|
| 85 |
use_web_fallback: bool = True
|
| 86 |
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
# NEW: Speech-to-Text Models
|
| 89 |
class TranscribeRequest(BaseModel):
|
| 90 |
audio_filename: str
|
|
@@ -512,6 +523,44 @@ async def hybrid_query(request: HybridQueryRequest):
|
|
| 512 |
raise HTTPException(status_code=500, detail=str(e))
|
| 513 |
|
| 514 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
# ============================================================================
|
| 516 |
# VOICE-TO-TEXT ENDPOINTS (NEW)
|
| 517 |
# ============================================================================
|
|
|
|
| 85 |
use_web_fallback: bool = True
|
| 86 |
|
| 87 |
|
# Fast endpoints for Node-side orchestration
class EmbedRequest(BaseModel):
    # Raw text to embed with the sentence-transformer (consumed by /embed).
    text: str
| 91 |
+
|
| 92 |
+
|
class GenerateRequest(BaseModel):
    # User question to answer.
    query: str
    # Pre-built retrieval context; the Node backend does retrieval itself.
    context: str
    source_type: str = "documents"  # "documents" | "web"
| 97 |
+
|
| 98 |
+
|
# NEW: Speech-to-Text Models
class TranscribeRequest(BaseModel):
    # Name of a previously uploaded audio file to transcribe.
    audio_filename: str
|
|
|
| 523 |
raise HTTPException(status_code=500, detail=str(e))
|
| 524 |
|
| 525 |
|
| 526 |
# ============================================================================
# FAST PRIMITIVE ENDPOINTS (used by Node backend for server-side RAG)
# ============================================================================

@app.post("/embed")
async def embed_text(request: EmbedRequest):
    """
    Embed a single text string and return its float vector.

    Uses only the sentence-transformer (fast, no LLM needed).

    Returns:
        dict with "embedding" (list of floats) and "dimension" (vector length).

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        # Imported lazily so the embedding model is only loaded on first use.
        from models.embeddings import get_embedding_model

        embedding_model = get_embedding_model()
        vector = embedding_model.encode_query(request.text)
        return {"embedding": vector.tolist(), "dimension": len(vector)}
    except Exception as e:
        # Chain the original exception so server logs keep the root cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
| 543 |
+
|
| 544 |
+
|
| 545 |
@app.post("/generate")
async def generate_answer(request: GenerateRequest):
    """
    Generate a short answer given pre-built context.

    Called by the Node backend after it has already done retrieval from
    MongoDB. Much faster than /assistant because no retrieval step happens
    here.

    Returns:
        dict with a single "answer" string.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        assistant = get_hybrid_assistant_instance()
        # NOTE(review): this reaches into a private method of the assistant;
        # consider exposing a public generate_from_context() wrapper instead.
        answer = assistant._generate_answer(
            query=request.query,
            context=request.context,
            source_type=request.source_type,
        )
        return {"answer": answer}
    except Exception as e:
        # Chain the original exception so server logs keep the root cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
| 562 |
+
|
| 563 |
+
|
| 564 |
# ============================================================================
|
| 565 |
# VOICE-TO-TEXT ENDPOINTS (NEW)
|
| 566 |
# ============================================================================
|
hybrid/assistant.py
CHANGED
|
@@ -126,46 +126,27 @@ class HybridAssistant:
|
|
| 126 |
context: str,
|
| 127 |
source_type: str
|
| 128 |
) -> str:
|
| 129 |
-
"""Generate answer from context"""
|
| 130 |
-
|
| 131 |
-
if source_type == "documents":
|
| 132 |
-
prompt = f"""You are a helpful AI assistant. Answer the question using ONLY the information from the provided context.
|
| 133 |
-
|
| 134 |
-
Context from uploaded documents:
|
| 135 |
-
{context}
|
| 136 |
-
|
| 137 |
-
Question: {query}
|
| 138 |
-
|
| 139 |
-
Instructions:
|
| 140 |
-
- Answer based on the context above
|
| 141 |
-
- Cite sources using [Source 1], [Source 2], etc.
|
| 142 |
-
- If the context doesn't fully answer the question, say so
|
| 143 |
-
- Be concise and accurate
|
| 144 |
-
|
| 145 |
-
Answer:"""
|
| 146 |
-
|
| 147 |
-
else: # web sources
|
| 148 |
-
prompt = f"""You are a helpful AI assistant. Answer the question using the information from web search results.
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
|
| 161 |
-
Answer:"""
|
| 162 |
-
|
| 163 |
response = self.llm.generate(
|
| 164 |
prompt=prompt,
|
| 165 |
-
max_new_tokens=
|
| 166 |
-
temperature=0.7
|
| 167 |
)
|
| 168 |
-
|
| 169 |
return response.strip()
|
| 170 |
|
| 171 |
# Singleton
|
|
|
|
| 126 |
context: str,
|
| 127 |
source_type: str
|
| 128 |
) -> str:
|
| 129 |
+
"""Generate answer from context using TinyLlama chat format for speed."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
+
# TinyLlama chat template — keeps token count minimal for fast inference
|
| 132 |
+
if source_type == "documents":
|
| 133 |
+
system = "You are a helpful study assistant. Answer ONLY using the provided context. Cite [Source N] inline."
|
| 134 |
+
user_content = f"Context:\n{context[:1500]}\n\nQuestion: {query}"
|
| 135 |
+
else:
|
| 136 |
+
system = "You are a helpful assistant. Summarise the web results to answer the question concisely. Cite [Web N] inline."
|
| 137 |
+
user_content = f"Web results:\n{context[:1500]}\n\nQuestion: {query}"
|
| 138 |
|
| 139 |
+
prompt = (
|
| 140 |
+
f"<|system|>\n{system}</s>\n"
|
| 141 |
+
f"<|user|>\n{user_content}</s>\n"
|
| 142 |
+
f"<|assistant|>\n"
|
| 143 |
+
)
|
| 144 |
|
|
|
|
|
|
|
| 145 |
response = self.llm.generate(
|
| 146 |
prompt=prompt,
|
| 147 |
+
max_new_tokens=150,
|
|
|
|
| 148 |
)
|
| 149 |
+
|
| 150 |
return response.strip()
|
| 151 |
|
| 152 |
# Singleton
|
models/llm.py
CHANGED
|
@@ -60,43 +60,35 @@ class LanguageModel:
|
|
| 60 |
def generate(
|
| 61 |
self,
|
| 62 |
prompt: str,
|
| 63 |
-
max_new_tokens: int =
|
| 64 |
temperature: float = TEMPERATURE,
|
| 65 |
top_p: float = TOP_P
|
| 66 |
) -> str:
|
| 67 |
"""
|
| 68 |
-
Generate text from prompt
|
| 69 |
-
|
| 70 |
-
Args:
|
| 71 |
-
prompt: Input prompt
|
| 72 |
-
max_new_tokens: Maximum tokens to generate
|
| 73 |
-
temperature: Sampling temperature
|
| 74 |
-
top_p: Top-p sampling
|
| 75 |
-
|
| 76 |
-
Returns:
|
| 77 |
-
Generated text
|
| 78 |
"""
|
| 79 |
-
inputs = self.tokenizer(
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
with torch.no_grad():
|
| 82 |
outputs = self.model.generate(
|
| 83 |
**inputs,
|
| 84 |
max_new_tokens=max_new_tokens,
|
| 85 |
-
|
| 86 |
-
top_p=top_p,
|
| 87 |
-
do_sample=True,
|
| 88 |
pad_token_id=self.tokenizer.pad_token_id,
|
| 89 |
-
eos_token_id=self.tokenizer.eos_token_id
|
|
|
|
| 90 |
)
|
| 91 |
-
|
| 92 |
-
# Decode
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
generated_text = generated_text[len(prompt):].strip()
|
| 98 |
-
|
| 99 |
-
return generated_text
|
| 100 |
|
| 101 |
# Singleton instance
|
| 102 |
_llm_model = None
|
|
|
|
def generate(
    self,
    prompt: str,
    max_new_tokens: int = 150,
    temperature: float = TEMPERATURE,
    top_p: float = TOP_P
) -> str:
    """
    Generate text from a prompt using greedy decoding for speed.

    Args:
        prompt: Input prompt text.
        max_new_tokens: Maximum number of new tokens to generate.
        temperature: Accepted for backward compatibility but IGNORED —
            decoding is greedy (``do_sample=False``), so no sampling occurs.
        top_p: Accepted for backward compatibility but IGNORED (see above).

    Returns:
        Only the newly generated text (the prompt is not echoed back),
        stripped of surrounding whitespace.
    """
    inputs = self.tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,  # cap input to avoid OOM and slow processing
    ).to(self.model.device)

    with torch.no_grad():
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy — ~3x faster than sampling
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            repetition_penalty=1.1,  # avoid repetition loops
        )

    # Decode only the newly generated tokens (skip the prompt tokens).
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]
    generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
    return generated_text.strip()
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
# Singleton instance
|
| 94 |
_llm_model = None
|