Spaces:
Sleeping
Sleeping
USAMA BHATTI commited on
Commit ·
a833774
1
Parent(s): 239dbce
Feat: Implement Secure Multi-tenant SaaS Architecture with API Key Auth, Domain Whitelisting, and Strict AI Grounding
Browse files- backend/src/api/routes/ingestion.py +66 -45
- backend/src/services/ingestion/crawler.py +55 -44
- backend/src/services/ingestion/file_processor.py +53 -50
- backend/src/services/ingestion/guardrail_factory.py +31 -14
- backend/src/services/ingestion/web_processor.py +47 -28
- backend/src/services/ingestion/zip_processor.py +73 -29
- backend/src/services/vector_store/qdrant_adapter.py +35 -63
backend/src/api/routes/ingestion.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
import os
|
| 3 |
import shutil
|
| 4 |
from fastapi import APIRouter, UploadFile, File, HTTPException, Form, BackgroundTasks, Depends
|
|
@@ -6,11 +5,11 @@ from pydantic import BaseModel
|
|
| 6 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 7 |
from sqlalchemy.future import select
|
| 8 |
|
| 9 |
-
# --- Security
|
| 10 |
from backend.src.api.routes.deps import get_current_user
|
| 11 |
from backend.src.models.user import User
|
| 12 |
|
| 13 |
-
# --- Internal Services
|
| 14 |
from backend.src.services.ingestion.file_processor import process_file
|
| 15 |
from backend.src.services.ingestion.crawler import SmartCrawler
|
| 16 |
from backend.src.services.ingestion.zip_processor import SmartZipProcessor
|
|
@@ -20,55 +19,67 @@ from backend.src.models.ingestion import IngestionJob, JobStatus, IngestionType
|
|
| 20 |
# --- CONFIG ---
|
| 21 |
MAX_ZIP_SIZE_MB = 100
|
| 22 |
MAX_ZIP_SIZE_BYTES = MAX_ZIP_SIZE_MB * 1024 * 1024
|
|
|
|
| 23 |
|
| 24 |
router = APIRouter()
|
| 25 |
-
UPLOAD_DIRECTORY = "./uploaded_files"
|
| 26 |
|
| 27 |
# ==========================================
|
| 28 |
-
# FILE UPLOAD (
|
| 29 |
# ==========================================
|
| 30 |
@router.post("/ingest/upload")
|
| 31 |
async def upload_and_process_file(
|
| 32 |
session_id: str = Form(...),
|
| 33 |
file: UploadFile = File(...),
|
| 34 |
-
|
|
|
|
| 35 |
):
|
| 36 |
-
# (Function logic same rahegi, bas ab current_user mil jayega)
|
| 37 |
if not os.path.exists(UPLOAD_DIRECTORY):
|
| 38 |
os.makedirs(UPLOAD_DIRECTORY)
|
| 39 |
|
| 40 |
file_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
|
| 41 |
try:
|
|
|
|
| 42 |
with open(file_path, "wb") as buffer:
|
| 43 |
shutil.copyfileobj(file.file, buffer)
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
return {
|
| 50 |
-
"
|
| 51 |
"filename": file.filename,
|
| 52 |
-
"
|
| 53 |
-
"
|
| 54 |
}
|
|
|
|
| 55 |
except Exception as e:
|
| 56 |
raise HTTPException(status_code=500, detail=str(e))
|
| 57 |
finally:
|
| 58 |
-
if os.path.exists(file_path):
|
| 59 |
-
os.remove(file_path)
|
| 60 |
|
| 61 |
# ==========================================
|
| 62 |
-
# WEB CRAWLER (
|
| 63 |
# ==========================================
|
| 64 |
class WebIngestRequest(BaseModel):
|
| 65 |
url: str
|
| 66 |
session_id: str
|
| 67 |
crawl_type: str = "single_page"
|
| 68 |
|
| 69 |
-
|
|
|
|
| 70 |
async with db_factory() as db:
|
| 71 |
-
|
|
|
|
| 72 |
await crawler.start()
|
| 73 |
|
| 74 |
@router.post("/ingest/url")
|
|
@@ -76,9 +87,8 @@ async def start_web_ingestion(
|
|
| 76 |
request: WebIngestRequest,
|
| 77 |
background_tasks: BackgroundTasks,
|
| 78 |
db: AsyncSession = Depends(get_db),
|
| 79 |
-
current_user: User = Depends(get_current_user)
|
| 80 |
):
|
| 81 |
-
# (Function logic same rahegi)
|
| 82 |
new_job = IngestionJob(
|
| 83 |
session_id=request.session_id,
|
| 84 |
ingestion_type=IngestionType.URL,
|
|
@@ -89,28 +99,21 @@ async def start_web_ingestion(
|
|
| 89 |
await db.commit()
|
| 90 |
await db.refresh(new_job)
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
current_user: User = Depends(get_current_user) # <--- 🔒 TALA LAGA DIYA
|
| 100 |
-
):
|
| 101 |
-
# (Function logic same rahegi)
|
| 102 |
-
result = await db.execute(select(IngestionJob).where(IngestionJob.id == job_id))
|
| 103 |
-
job = result.scalars().first()
|
| 104 |
-
if not job:
|
| 105 |
-
raise HTTPException(status_code=404, detail="Job not found")
|
| 106 |
-
return job
|
| 107 |
|
| 108 |
# ==========================================
|
| 109 |
-
# BULK ZIP UPLOAD (
|
| 110 |
# ==========================================
|
| 111 |
-
async def run_zip_task(job_id, zip_path, session_id, db_factory):
|
| 112 |
async with db_factory() as db:
|
| 113 |
-
|
|
|
|
| 114 |
await processor.start()
|
| 115 |
|
| 116 |
@router.post("/ingest/upload-zip")
|
|
@@ -119,13 +122,10 @@ async def upload_and_process_zip(
|
|
| 119 |
file: UploadFile = File(...),
|
| 120 |
background_tasks: BackgroundTasks = BackgroundTasks(),
|
| 121 |
db: AsyncSession = Depends(get_db),
|
| 122 |
-
current_user: User = Depends(get_current_user)
|
| 123 |
):
|
| 124 |
-
# (Function logic same rahegi)
|
| 125 |
if not file.filename.endswith(".zip"):
|
| 126 |
-
raise HTTPException(status_code=400, detail="
|
| 127 |
-
if file.size > MAX_ZIP_SIZE_BYTES:
|
| 128 |
-
raise HTTPException(status_code=413, detail=f"File too large. Max size is {MAX_ZIP_SIZE_MB} MB.")
|
| 129 |
|
| 130 |
zip_dir = os.path.join(UPLOAD_DIRECTORY, "zips")
|
| 131 |
os.makedirs(zip_dir, exist_ok=True)
|
|
@@ -144,5 +144,26 @@ async def upload_and_process_zip(
|
|
| 144 |
await db.commit()
|
| 145 |
await db.refresh(new_job)
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import shutil
|
| 3 |
from fastapi import APIRouter, UploadFile, File, HTTPException, Form, BackgroundTasks, Depends
|
|
|
|
| 5 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 6 |
from sqlalchemy.future import select
|
| 7 |
|
| 8 |
+
# --- Security & User Context ---
|
| 9 |
from backend.src.api.routes.deps import get_current_user
|
| 10 |
from backend.src.models.user import User
|
| 11 |
|
| 12 |
+
# --- Internal Services ---
|
| 13 |
from backend.src.services.ingestion.file_processor import process_file
|
| 14 |
from backend.src.services.ingestion.crawler import SmartCrawler
|
| 15 |
from backend.src.services.ingestion.zip_processor import SmartZipProcessor
|
|
|
|
| 19 |
# --- CONFIG ---
|
| 20 |
MAX_ZIP_SIZE_MB = 100
|
| 21 |
MAX_ZIP_SIZE_BYTES = MAX_ZIP_SIZE_MB * 1024 * 1024
|
| 22 |
+
UPLOAD_DIRECTORY = "./uploaded_files"
|
| 23 |
|
| 24 |
router = APIRouter()
|
|
|
|
| 25 |
|
| 26 |
# ==========================================
|
| 27 |
+
# 1. INDIVIDUAL FILE UPLOAD (Secure ✅)
|
| 28 |
# ==========================================
|
| 29 |
@router.post("/ingest/upload")
|
| 30 |
async def upload_and_process_file(
|
| 31 |
session_id: str = Form(...),
|
| 32 |
file: UploadFile = File(...),
|
| 33 |
+
db: AsyncSession = Depends(get_db), # DB session add ki
|
| 34 |
+
current_user: User = Depends(get_current_user)
|
| 35 |
):
|
|
|
|
| 36 |
if not os.path.exists(UPLOAD_DIRECTORY):
|
| 37 |
os.makedirs(UPLOAD_DIRECTORY)
|
| 38 |
|
| 39 |
file_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
|
| 40 |
try:
|
| 41 |
+
# File temporary save karein
|
| 42 |
with open(file_path, "wb") as buffer:
|
| 43 |
shutil.copyfileobj(file.file, buffer)
|
| 44 |
|
| 45 |
+
# 🚀 PASSING USER CONTEXT: process_file ab user_id aur db mang raha hai
|
| 46 |
+
chunks_added = await process_file(
|
| 47 |
+
file_path=file_path,
|
| 48 |
+
session_id=session_id,
|
| 49 |
+
user_id=str(current_user.id),
|
| 50 |
+
db=db
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
if chunks_added == -1: # Database not connected error
|
| 54 |
+
raise HTTPException(status_code=400, detail="Database not connected. Please go to User Settings first.")
|
| 55 |
+
elif chunks_added <= 0:
|
| 56 |
+
raise HTTPException(status_code=400, detail="Could not extract content from file.")
|
| 57 |
|
| 58 |
return {
|
| 59 |
+
"status": "success",
|
| 60 |
"filename": file.filename,
|
| 61 |
+
"chunks": chunks_added,
|
| 62 |
+
"owner_id": current_user.id
|
| 63 |
}
|
| 64 |
+
except HTTPException as he: raise he
|
| 65 |
except Exception as e:
|
| 66 |
raise HTTPException(status_code=500, detail=str(e))
|
| 67 |
finally:
|
| 68 |
+
if os.path.exists(file_path): os.remove(file_path)
|
|
|
|
| 69 |
|
| 70 |
# ==========================================
|
| 71 |
+
# 2. WEB CRAWLER (Secure Background Task ✅)
|
| 72 |
# ==========================================
|
| 73 |
class WebIngestRequest(BaseModel):
|
| 74 |
url: str
|
| 75 |
session_id: str
|
| 76 |
crawl_type: str = "single_page"
|
| 77 |
|
| 78 |
+
# Helper to run crawler in background with User ID
|
| 79 |
+
async def run_crawler_task(job_id, url, session_id, crawl_type, db_factory, user_id):
|
| 80 |
async with db_factory() as db:
|
| 81 |
+
# 🚀 PASSING USER ID: Crawler ko bataya kis ka data hai
|
| 82 |
+
crawler = SmartCrawler(job_id, url, session_id, crawl_type, db, user_id=user_id)
|
| 83 |
await crawler.start()
|
| 84 |
|
| 85 |
@router.post("/ingest/url")
|
|
|
|
| 87 |
request: WebIngestRequest,
|
| 88 |
background_tasks: BackgroundTasks,
|
| 89 |
db: AsyncSession = Depends(get_db),
|
| 90 |
+
current_user: User = Depends(get_current_user)
|
| 91 |
):
|
|
|
|
| 92 |
new_job = IngestionJob(
|
| 93 |
session_id=request.session_id,
|
| 94 |
ingestion_type=IngestionType.URL,
|
|
|
|
| 99 |
await db.commit()
|
| 100 |
await db.refresh(new_job)
|
| 101 |
|
| 102 |
+
# 🚀 BACKGROUND LINK: Pass user_id to the task
|
| 103 |
+
background_tasks.add_task(
|
| 104 |
+
run_crawler_task,
|
| 105 |
+
new_job.id, request.url, request.session_id, request.crawl_type,
|
| 106 |
+
AsyncSessionLocal, str(current_user.id)
|
| 107 |
+
)
|
| 108 |
+
return {"message": "Crawler started securely", "job_id": new_job.id}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
# ==========================================
|
| 111 |
+
# 3. BULK ZIP UPLOAD (Secure Background Task ✅)
|
| 112 |
# ==========================================
|
| 113 |
+
async def run_zip_task(job_id, zip_path, session_id, db_factory, user_id):
|
| 114 |
async with db_factory() as db:
|
| 115 |
+
# 🚀 PASSING USER ID: Zip processor ab owner-aware hai
|
| 116 |
+
processor = SmartZipProcessor(job_id, zip_path, session_id, db, user_id=user_id)
|
| 117 |
await processor.start()
|
| 118 |
|
| 119 |
@router.post("/ingest/upload-zip")
|
|
|
|
| 122 |
file: UploadFile = File(...),
|
| 123 |
background_tasks: BackgroundTasks = BackgroundTasks(),
|
| 124 |
db: AsyncSession = Depends(get_db),
|
| 125 |
+
current_user: User = Depends(get_current_user)
|
| 126 |
):
|
|
|
|
| 127 |
if not file.filename.endswith(".zip"):
|
| 128 |
+
raise HTTPException(status_code=400, detail="Invalid format. ZIP only.")
|
|
|
|
|
|
|
| 129 |
|
| 130 |
zip_dir = os.path.join(UPLOAD_DIRECTORY, "zips")
|
| 131 |
os.makedirs(zip_dir, exist_ok=True)
|
|
|
|
| 144 |
await db.commit()
|
| 145 |
await db.refresh(new_job)
|
| 146 |
|
| 147 |
+
# 🚀 BACKGROUND LINK: Pass user_id to the task
|
| 148 |
+
background_tasks.add_task(
|
| 149 |
+
run_zip_task,
|
| 150 |
+
new_job.id, file_path, session_id,
|
| 151 |
+
AsyncSessionLocal, str(current_user.id)
|
| 152 |
+
)
|
| 153 |
+
return {"message": "Secure Zip processing scheduled", "job_id": new_job.id}
|
| 154 |
+
|
| 155 |
+
# ==========================================
|
| 156 |
+
# 4. STATUS CHECKER (Secure ✅)
|
| 157 |
+
# ==========================================
|
| 158 |
+
@router.get("/ingest/status/{job_id}")
|
| 159 |
+
async def check_job_status(
|
| 160 |
+
job_id: int,
|
| 161 |
+
db: AsyncSession = Depends(get_db),
|
| 162 |
+
current_user: User = Depends(get_current_user)
|
| 163 |
+
):
|
| 164 |
+
# Only allow users to see their own session jobs? (Optional improvement)
|
| 165 |
+
result = await db.execute(select(IngestionJob).where(IngestionJob.id == job_id))
|
| 166 |
+
job = result.scalars().first()
|
| 167 |
+
if not job:
|
| 168 |
+
raise HTTPException(status_code=404, detail="Job not found")
|
| 169 |
+
return job
|
backend/src/services/ingestion/crawler.py
CHANGED
|
@@ -1,74 +1,94 @@
|
|
| 1 |
import asyncio
|
| 2 |
import requests
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
from urllib.parse import urljoin
|
| 6 |
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
|
|
|
| 7 |
from backend.src.models.ingestion import IngestionJob, JobStatus
|
|
|
|
| 8 |
from backend.src.services.vector_store.qdrant_adapter import get_vector_store
|
| 9 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 10 |
from langchain_core.documents import Document
|
| 11 |
from qdrant_client.http import models
|
| 12 |
|
| 13 |
-
# --- NEW IMPORT ---
|
| 14 |
from backend.src.services.ingestion.guardrail_factory import predict_with_model
|
| 15 |
|
| 16 |
-
# --- CONFIGURATION ---
|
| 17 |
MAX_PAGES_LIMIT = 50
|
| 18 |
|
| 19 |
class SmartCrawler:
|
| 20 |
-
|
|
|
|
| 21 |
self.job_id = job_id
|
| 22 |
self.root_url = url
|
| 23 |
self.session_id = session_id
|
| 24 |
self.crawl_type = crawl_type
|
| 25 |
self.db = db
|
|
|
|
| 26 |
self.visited = set()
|
| 27 |
-
self.vector_store =
|
| 28 |
-
# YAHAN SE MODEL LOAD HATA DIYA
|
| 29 |
|
| 30 |
async def log_status(self, status: str, processed=0, total=0, error=None):
|
| 31 |
try:
|
| 32 |
-
|
|
|
|
|
|
|
| 33 |
if job:
|
| 34 |
job.status = status
|
| 35 |
-
job.
|
| 36 |
-
job.
|
| 37 |
if error:
|
| 38 |
job.error_message = str(error)
|
| 39 |
await self.db.commit()
|
| 40 |
except Exception as e:
|
| 41 |
print(f"DB Log Error: {e}")
|
| 42 |
|
| 43 |
-
|
|
|
|
| 44 |
"""
|
| 45 |
-
|
| 46 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
sample_text = text[:300] + " ... " + text[len(text)//2 : len(text)//2 + 300]
|
| 48 |
label = "This is an e-commerce product page with price, buy button, or shopping cart."
|
| 49 |
|
| 50 |
-
# --- FIX: Call Factory Async Function ---
|
| 51 |
-
# Ab ye server ko block nahi karega
|
| 52 |
scores = await predict_with_model(sample_text, label)
|
| 53 |
-
|
| 54 |
-
# Softmax Calculation
|
| 55 |
probs = np.exp(scores) / np.sum(np.exp(scores))
|
| 56 |
entailment_score = probs[1]
|
| 57 |
|
| 58 |
-
print("\n" + "="*60)
|
| 59 |
-
print(f"🤖 AI ANALYSIS REPORT for: {url}")
|
| 60 |
-
print("-" * 60)
|
| 61 |
-
print(f"📊 Scores -> Contradiction: {probs[0]:.2f}, Entailment: {probs[1]:.2f}, Neutral: {probs[2]:.2f}")
|
| 62 |
-
print(f"🎯 Target Score (Entailment): {entailment_score:.4f} (Threshold: 0.5)")
|
| 63 |
-
|
| 64 |
if entailment_score > 0.5:
|
| 65 |
-
print(f"⛔
|
| 66 |
-
print("="*60 + "\n")
|
| 67 |
return True
|
| 68 |
-
|
| 69 |
-
print(f"✅ DECISION: ALLOWED")
|
| 70 |
-
print("="*60 + "\n")
|
| 71 |
-
return False
|
| 72 |
|
| 73 |
async def fetch_page(self, url: str):
|
| 74 |
try:
|
|
@@ -84,12 +104,7 @@ class SmartCrawler:
|
|
| 84 |
collection_name=self.vector_store.collection_name,
|
| 85 |
points_selector=models.FilterSelector(
|
| 86 |
filter=models.Filter(
|
| 87 |
-
must=[
|
| 88 |
-
models.FieldCondition(
|
| 89 |
-
key="metadata.source",
|
| 90 |
-
match=models.MatchValue(value=self.root_url)
|
| 91 |
-
)
|
| 92 |
-
]
|
| 93 |
)
|
| 94 |
)
|
| 95 |
)
|
|
@@ -101,15 +116,9 @@ class SmartCrawler:
|
|
| 101 |
script.extract()
|
| 102 |
|
| 103 |
text = soup.get_text(separator=" ", strip=True)
|
| 104 |
-
|
| 105 |
-
if len(text) < 200:
|
| 106 |
-
print(f"⚠️ Skipping {url} (Not enough text: {len(text)} chars)")
|
| 107 |
-
return False
|
| 108 |
|
| 109 |
-
|
| 110 |
-
# Ab hum 'await' use kar rahe hain taake ye background mein chale
|
| 111 |
-
if await self.is_ai_unsafe(text, url):
|
| 112 |
-
return False
|
| 113 |
|
| 114 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 115 |
docs = [Document(page_content=text, metadata={
|
|
@@ -125,6 +134,11 @@ class SmartCrawler:
|
|
| 125 |
|
| 126 |
async def start(self):
|
| 127 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
await self.log_status(JobStatus.PROCESSING)
|
| 129 |
await self.clean_existing_data()
|
| 130 |
|
|
@@ -134,13 +148,10 @@ class SmartCrawler:
|
|
| 134 |
|
| 135 |
while queue and total_processed < MAX_PAGES_LIMIT:
|
| 136 |
current_url = queue.pop(0)
|
| 137 |
-
|
| 138 |
response = await self.fetch_page(current_url)
|
| 139 |
-
if not response or response.status_code != 200:
|
| 140 |
-
continue
|
| 141 |
|
| 142 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 143 |
-
|
| 144 |
success = await self.process_page(current_url, soup)
|
| 145 |
|
| 146 |
if not success:
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import requests
|
| 3 |
+
import json # Credentials decode karne ke liye
|
| 4 |
import numpy as np
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
from urllib.parse import urljoin
|
| 7 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 8 |
+
from sqlalchemy.future import select # Query karne ke liye
|
| 9 |
+
|
| 10 |
from backend.src.models.ingestion import IngestionJob, JobStatus
|
| 11 |
+
from backend.src.models.integration import UserIntegration # integration model import kiya
|
| 12 |
from backend.src.services.vector_store.qdrant_adapter import get_vector_store
|
| 13 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 14 |
from langchain_core.documents import Document
|
| 15 |
from qdrant_client.http import models
|
| 16 |
|
|
|
|
| 17 |
from backend.src.services.ingestion.guardrail_factory import predict_with_model
|
| 18 |
|
|
|
|
| 19 |
MAX_PAGES_LIMIT = 50
|
| 20 |
|
| 21 |
class SmartCrawler:
|
| 22 |
+
# 1. Init mein 'user_id' add kiya taake hum uski settings dhoond saken
|
| 23 |
+
def __init__(self, job_id: int, url: str, session_id: str, crawl_type: str, db: AsyncSession, user_id: str):
|
| 24 |
self.job_id = job_id
|
| 25 |
self.root_url = url
|
| 26 |
self.session_id = session_id
|
| 27 |
self.crawl_type = crawl_type
|
| 28 |
self.db = db
|
| 29 |
+
self.user_id = user_id # Owner ID
|
| 30 |
self.visited = set()
|
| 31 |
+
self.vector_store = None # Shuru mein None rakhein, verification ke baad fill hoga
|
|
|
|
| 32 |
|
| 33 |
async def log_status(self, status: str, processed=0, total=0, error=None):
|
| 34 |
try:
|
| 35 |
+
# SQL Alchemy 2.0 style query
|
| 36 |
+
result = await self.db.execute(select(IngestionJob).where(IngestionJob.id == self.job_id))
|
| 37 |
+
job = result.scalars().first()
|
| 38 |
if job:
|
| 39 |
job.status = status
|
| 40 |
+
job.items_processed = processed # Column name match karein (items_processed)
|
| 41 |
+
job.total_items = total
|
| 42 |
if error:
|
| 43 |
job.error_message = str(error)
|
| 44 |
await self.db.commit()
|
| 45 |
except Exception as e:
|
| 46 |
print(f"DB Log Error: {e}")
|
| 47 |
|
| 48 |
+
# --- NEW: STRICT DATABASE VERIFICATION SKILL ---
|
| 49 |
+
async def verify_and_connect_db(self) -> bool:
|
| 50 |
"""
|
| 51 |
+
Check if user has a valid Qdrant Cloud integration.
|
| 52 |
"""
|
| 53 |
+
print(f"🔍 Verifying Database for User ID: {self.user_id}")
|
| 54 |
+
try:
|
| 55 |
+
stmt = select(UserIntegration).where(
|
| 56 |
+
UserIntegration.user_id == str(self.user_id),
|
| 57 |
+
UserIntegration.provider == "qdrant",
|
| 58 |
+
UserIntegration.is_active == True
|
| 59 |
+
)
|
| 60 |
+
result = await self.db.execute(stmt)
|
| 61 |
+
integration = result.scalars().first()
|
| 62 |
+
|
| 63 |
+
if not integration:
|
| 64 |
+
error_msg = "❌ No Qdrant Cloud connected. Please go to 'Settings' and connect your database first."
|
| 65 |
+
print(error_msg)
|
| 66 |
+
await self.log_status(JobStatus.FAILED, error=error_msg)
|
| 67 |
+
return False
|
| 68 |
+
|
| 69 |
+
# User ki encrypted/json credentials nikalen
|
| 70 |
+
creds = json.loads(integration.credentials) if isinstance(integration.credentials, str) else integration.credentials
|
| 71 |
+
|
| 72 |
+
# Smart Adapter ko user ki chabiyan (keys) bhejein
|
| 73 |
+
self.vector_store = get_vector_store(credentials=creds)
|
| 74 |
+
return True
|
| 75 |
+
|
| 76 |
+
except Exception as e:
|
| 77 |
+
await self.log_status(JobStatus.FAILED, error=f"Database Connection Error: {str(e)}")
|
| 78 |
+
return False
|
| 79 |
+
|
| 80 |
+
async def is_ai_unsafe(self, text: str, url: str) -> bool:
|
| 81 |
sample_text = text[:300] + " ... " + text[len(text)//2 : len(text)//2 + 300]
|
| 82 |
label = "This is an e-commerce product page with price, buy button, or shopping cart."
|
| 83 |
|
|
|
|
|
|
|
| 84 |
scores = await predict_with_model(sample_text, label)
|
|
|
|
|
|
|
| 85 |
probs = np.exp(scores) / np.sum(np.exp(scores))
|
| 86 |
entailment_score = probs[1]
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
if entailment_score > 0.5:
|
| 89 |
+
print(f"⛔ AI BLOCKED (E-commerce): {url}")
|
|
|
|
| 90 |
return True
|
| 91 |
+
return False
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
async def fetch_page(self, url: str):
|
| 94 |
try:
|
|
|
|
| 104 |
collection_name=self.vector_store.collection_name,
|
| 105 |
points_selector=models.FilterSelector(
|
| 106 |
filter=models.Filter(
|
| 107 |
+
must=[models.FieldCondition(key="metadata.source", match=models.MatchValue(value=self.root_url))]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
)
|
| 109 |
)
|
| 110 |
)
|
|
|
|
| 116 |
script.extract()
|
| 117 |
|
| 118 |
text = soup.get_text(separator=" ", strip=True)
|
| 119 |
+
if len(text) < 200: return False
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
+
if await self.is_ai_unsafe(text, url): return False
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 124 |
docs = [Document(page_content=text, metadata={
|
|
|
|
| 134 |
|
| 135 |
async def start(self):
|
| 136 |
try:
|
| 137 |
+
# 1. PEHLA KAAM: Database check karo
|
| 138 |
+
db_ready = await self.verify_and_connect_db()
|
| 139 |
+
if not db_ready:
|
| 140 |
+
return # Stop process if no DB
|
| 141 |
+
|
| 142 |
await self.log_status(JobStatus.PROCESSING)
|
| 143 |
await self.clean_existing_data()
|
| 144 |
|
|
|
|
| 148 |
|
| 149 |
while queue and total_processed < MAX_PAGES_LIMIT:
|
| 150 |
current_url = queue.pop(0)
|
|
|
|
| 151 |
response = await self.fetch_page(current_url)
|
| 152 |
+
if not response or response.status_code != 200: continue
|
|
|
|
| 153 |
|
| 154 |
soup = BeautifulSoup(response.content, 'html.parser')
|
|
|
|
| 155 |
success = await self.process_page(current_url, soup)
|
| 156 |
|
| 157 |
if not success:
|
backend/src/services/ingestion/file_processor.py
CHANGED
|
@@ -1,94 +1,97 @@
|
|
| 1 |
-
# backend/src/services/ingestion/file_processor.py
|
| 2 |
import os
|
| 3 |
import asyncio
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
# Specific Stable Loaders
|
| 5 |
from langchain_community.document_loaders import (
|
| 6 |
TextLoader,
|
| 7 |
PyPDFLoader,
|
| 8 |
CSVLoader,
|
| 9 |
Docx2txtLoader,
|
| 10 |
-
UnstructuredMarkdownLoader
|
|
|
|
| 11 |
)
|
| 12 |
-
# Fallback loader (agar upar walon mein se koi na ho)
|
| 13 |
-
from langchain_community.document_loaders import UnstructuredFileLoader
|
| 14 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 15 |
from backend.src.services.vector_store.qdrant_adapter import get_vector_store
|
|
|
|
| 16 |
|
| 17 |
def get_loader(file_path: str):
|
| 18 |
"""
|
| 19 |
Factory function jo file extension ke hisaab se
|
| 20 |
-
|
| 21 |
"""
|
| 22 |
ext = os.path.splitext(file_path)[1].lower()
|
| 23 |
|
| 24 |
if ext == ".txt":
|
| 25 |
-
# TextLoader sabse fast aur safe hai
|
| 26 |
return TextLoader(file_path, encoding="utf-8")
|
| 27 |
-
|
| 28 |
elif ext == ".pdf":
|
| 29 |
-
# PyPDFLoader pure python hai, hang nahi hota
|
| 30 |
return PyPDFLoader(file_path)
|
| 31 |
-
|
| 32 |
elif ext == ".csv":
|
| 33 |
return CSVLoader(file_path, encoding="utf-8")
|
| 34 |
-
|
| 35 |
elif ext in [".doc", ".docx"]:
|
| 36 |
-
# Docx2txtLoader light hai
|
| 37 |
return Docx2txtLoader(file_path)
|
| 38 |
-
|
| 39 |
elif ext == ".md":
|
| 40 |
-
# Markdown ko hum TextLoader se bhi parh sakte hain agar Unstructured tang kare
|
| 41 |
return TextLoader(file_path, encoding="utf-8")
|
| 42 |
-
|
| 43 |
else:
|
| 44 |
-
# Agar koi ajeeb format ho, tab hum Heavy 'Unstructured' loader try karenge
|
| 45 |
-
print(f"INFO: Unknown format '{ext}', attempting to use UnstructuredFileLoader...")
|
| 46 |
return UnstructuredFileLoader(file_path)
|
| 47 |
|
| 48 |
-
|
|
|
|
| 49 |
"""
|
| 50 |
-
Processes a single uploaded file
|
| 51 |
-
Supports: TXT, PDF, CSV, DOCX, MD and others.
|
| 52 |
"""
|
| 53 |
-
print(f"INFO: [Ingestion] Starting processing for
|
| 54 |
|
| 55 |
try:
|
| 56 |
-
# 1.
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
#
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
| 61 |
docs = await asyncio.to_thread(loader.load)
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
if not docs:
|
| 68 |
-
print(f"WARNING: [Ingestion] Could not extract any content from {file_path}")
|
| 69 |
-
return 0
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
|
| 86 |
-
|
| 87 |
-
try:
|
| 88 |
-
vector_store = get_vector_store()
|
| 89 |
await vector_store.aadd_documents(split_docs)
|
| 90 |
-
print(f"SUCCESS:
|
| 91 |
return len(split_docs)
|
|
|
|
| 92 |
except Exception as e:
|
| 93 |
-
print(f"ERROR: [Ingestion]
|
| 94 |
return 0
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import asyncio
|
| 3 |
+
import json
|
| 4 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 5 |
+
from sqlalchemy.future import select
|
| 6 |
+
|
| 7 |
# Specific Stable Loaders
|
| 8 |
from langchain_community.document_loaders import (
|
| 9 |
TextLoader,
|
| 10 |
PyPDFLoader,
|
| 11 |
CSVLoader,
|
| 12 |
Docx2txtLoader,
|
| 13 |
+
UnstructuredMarkdownLoader,
|
| 14 |
+
UnstructuredFileLoader
|
| 15 |
)
|
|
|
|
|
|
|
| 16 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 17 |
from backend.src.services.vector_store.qdrant_adapter import get_vector_store
|
| 18 |
+
from backend.src.models.integration import UserIntegration # Integration model zaroori hai
|
| 19 |
|
| 20 |
def get_loader(file_path: str):
|
| 21 |
"""
|
| 22 |
Factory function jo file extension ke hisaab se
|
| 23 |
+
loader return karta hai.
|
| 24 |
"""
|
| 25 |
ext = os.path.splitext(file_path)[1].lower()
|
| 26 |
|
| 27 |
if ext == ".txt":
|
|
|
|
| 28 |
return TextLoader(file_path, encoding="utf-8")
|
|
|
|
| 29 |
elif ext == ".pdf":
|
|
|
|
| 30 |
return PyPDFLoader(file_path)
|
|
|
|
| 31 |
elif ext == ".csv":
|
| 32 |
return CSVLoader(file_path, encoding="utf-8")
|
|
|
|
| 33 |
elif ext in [".doc", ".docx"]:
|
|
|
|
| 34 |
return Docx2txtLoader(file_path)
|
|
|
|
| 35 |
elif ext == ".md":
|
|
|
|
| 36 |
return TextLoader(file_path, encoding="utf-8")
|
|
|
|
| 37 |
else:
|
|
|
|
|
|
|
| 38 |
return UnstructuredFileLoader(file_path)
|
| 39 |
|
| 40 |
+
# --- UPDATED: Added user_id and db session ---
|
| 41 |
+
async def process_file(file_path: str, session_id: str, user_id: str, db: AsyncSession):
|
| 42 |
"""
|
| 43 |
+
Processes a single uploaded file strictly using the USER'S database.
|
|
|
|
| 44 |
"""
|
| 45 |
+
print(f"INFO: [Ingestion] Starting secure processing for user {user_id}: {file_path}")
|
| 46 |
|
| 47 |
try:
|
| 48 |
+
# 1. DATABASE VERIFICATION: Check if user has Qdrant connected
|
| 49 |
+
stmt = select(UserIntegration).where(
|
| 50 |
+
UserIntegration.user_id == str(user_id),
|
| 51 |
+
UserIntegration.provider == "qdrant",
|
| 52 |
+
UserIntegration.is_active == True
|
| 53 |
+
)
|
| 54 |
+
result = await db.execute(stmt)
|
| 55 |
+
integration = result.scalars().first()
|
| 56 |
+
|
| 57 |
+
if not integration:
|
| 58 |
+
print(f"❌ ERROR: User {user_id} has no Qdrant connected.")
|
| 59 |
+
return -1 # Special code for 'No Database'
|
| 60 |
+
|
| 61 |
+
# 2. Extract Credentials
|
| 62 |
+
creds = json.loads(integration.credentials) if isinstance(integration.credentials, str) else integration.credentials
|
| 63 |
|
| 64 |
+
# 3. Connect to User's Cloud Qdrant (No Fallback to Localhost)
|
| 65 |
+
vector_store = get_vector_store(credentials=creds)
|
| 66 |
+
|
| 67 |
+
# 4. File Loading
|
| 68 |
+
loader = get_loader(file_path)
|
| 69 |
docs = await asyncio.to_thread(loader.load)
|
| 70 |
|
| 71 |
+
if not docs:
|
| 72 |
+
print(f"WARNING: No content extracted from {file_path}")
|
| 73 |
+
return 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
+
# 5. Chunks Creation
|
| 76 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 77 |
+
chunk_size=1000,
|
| 78 |
+
chunk_overlap=200,
|
| 79 |
+
length_function=len
|
| 80 |
+
)
|
| 81 |
+
split_docs = text_splitter.split_documents(docs)
|
| 82 |
+
|
| 83 |
+
# Metadata logic
|
| 84 |
+
for doc in split_docs:
|
| 85 |
+
doc.metadata["session_id"] = session_id
|
| 86 |
+
doc.metadata["user_id"] = user_id
|
| 87 |
+
doc.metadata["file_name"] = os.path.basename(file_path)
|
| 88 |
+
doc.metadata["source"] = os.path.basename(file_path) # Search ke liye source zaroori hai
|
| 89 |
|
| 90 |
+
# 6. Upload to User's Vector DB
|
|
|
|
|
|
|
| 91 |
await vector_store.aadd_documents(split_docs)
|
| 92 |
+
print(f"SUCCESS: Processed {len(split_docs)} chunks to user's Cloud Qdrant.")
|
| 93 |
return len(split_docs)
|
| 94 |
+
|
| 95 |
except Exception as e:
|
| 96 |
+
print(f"ERROR: [Ingestion] Critical failure: {e}")
|
| 97 |
return 0
|
backend/src/services/ingestion/guardrail_factory.py
CHANGED
|
@@ -1,28 +1,45 @@
|
|
| 1 |
from sentence_transformers import CrossEncoder
|
| 2 |
-
from functools import lru_cache
|
| 3 |
import asyncio
|
|
|
|
| 4 |
|
| 5 |
-
# Global Cache
|
| 6 |
_model_instance = None
|
| 7 |
|
| 8 |
def get_guardrail_model():
|
| 9 |
"""
|
| 10 |
-
|
|
|
|
| 11 |
"""
|
| 12 |
global _model_instance
|
| 13 |
if _model_instance is None:
|
| 14 |
-
|
| 15 |
-
#
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
return _model_instance
|
| 19 |
|
| 20 |
-
async def predict_with_model(text, label):
|
| 21 |
"""
|
| 22 |
-
|
|
|
|
| 23 |
"""
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from sentence_transformers import CrossEncoder
|
|
|
|
| 2 |
import asyncio
|
| 3 |
+
import os
|
| 4 |
|
| 5 |
+
# Global Cache for Singleton Pattern
|
| 6 |
_model_instance = None
|
| 7 |
|
| 8 |
def get_guardrail_model():
|
| 9 |
"""
|
| 10 |
+
Skill: AI Guardrail Loader. Loads model into RAM only once.
|
| 11 |
+
Optimized for SaaS performance.
|
| 12 |
"""
|
| 13 |
global _model_instance
|
| 14 |
if _model_instance is None:
|
| 15 |
+
# Railway RAM optimization: Agar heavy model crash kare, toh TinyBERT use karein
|
| 16 |
+
# Default: nli-distilroberta-base
|
| 17 |
+
model_name = os.getenv("GUARDRAIL_MODEL", "cross-encoder/nli-distilroberta-base")
|
| 18 |
+
|
| 19 |
+
print(f"⏳ [AI-Guardrail] Loading Model: {model_name}...")
|
| 20 |
+
try:
|
| 21 |
+
_model_instance = CrossEncoder(model_name)
|
| 22 |
+
print("✅ [AI-Guardrail] Model ready for inference.")
|
| 23 |
+
except Exception as e:
|
| 24 |
+
print(f"❌ [AI-Guardrail] Failed to load model: {e}")
|
| 25 |
+
raise e
|
| 26 |
+
|
| 27 |
return _model_instance
|
| 28 |
|
| 29 |
+
async def predict_with_model(text: str, label: str):
|
| 30 |
"""
|
| 31 |
+
Skill: Asynchronous AI Prediction.
|
| 32 |
+
Ensures that heavy CPU tasks don't block the FastAPI event loop.
|
| 33 |
"""
|
| 34 |
+
try:
|
| 35 |
+
model = get_guardrail_model()
|
| 36 |
+
|
| 37 |
+
# Heavy computation offloaded to a separate thread (Non-blocking SaaS)
|
| 38 |
+
scores = await asyncio.to_thread(model.predict, [(text, label)])
|
| 39 |
+
|
| 40 |
+
# Returning only the score list
|
| 41 |
+
return scores[0]
|
| 42 |
+
except Exception as e:
|
| 43 |
+
print(f"⚠️ [AI-Guardrail] Prediction Error: {e}")
|
| 44 |
+
# Default score return (Neutral/Allow) in case of error to keep ingestion running
|
| 45 |
+
return [0.0, 0.0, 0.0]
|
backend/src/services/ingestion/web_processor.py
CHANGED
|
@@ -1,17 +1,40 @@
|
|
| 1 |
import asyncio
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from langchain_community.document_loaders import WebBaseLoader
|
| 3 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 4 |
from backend.src.services.vector_store.qdrant_adapter import get_vector_store
|
|
|
|
| 5 |
|
| 6 |
-
async def process_url(url: str, session_id: str):
|
| 7 |
"""
|
| 8 |
-
|
| 9 |
"""
|
| 10 |
-
print(f"INFO: [Ingestion]
|
| 11 |
|
| 12 |
try:
|
| 13 |
-
# 1.
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def load_data():
|
| 16 |
loader = WebBaseLoader(url)
|
| 17 |
return loader.load()
|
|
@@ -22,32 +45,28 @@ async def process_url(url: str, session_id: str):
|
|
| 22 |
print(f"WARNING: [Ingestion] No content found at {url}")
|
| 23 |
return 0
|
| 24 |
|
| 25 |
-
print(f"INFO: [Ingestion]
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
doc.metadata["session_id"] = session_id
|
| 42 |
-
doc.metadata["source"] = url # Taake pata chale ye data kahan se aaya
|
| 43 |
-
doc.metadata["type"] = "web_scrape"
|
| 44 |
|
| 45 |
-
|
| 46 |
-
try:
|
| 47 |
-
vector_store = get_vector_store()
|
| 48 |
await vector_store.aadd_documents(split_docs)
|
| 49 |
-
print(f"SUCCESS: [Ingestion]
|
| 50 |
return len(split_docs)
|
|
|
|
| 51 |
except Exception as e:
|
| 52 |
-
print(f"ERROR: [Ingestion]
|
| 53 |
return 0
|
|
|
|
| 1 |
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 4 |
+
from sqlalchemy.future import select
|
| 5 |
+
|
| 6 |
from langchain_community.document_loaders import WebBaseLoader
|
| 7 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 8 |
from backend.src.services.vector_store.qdrant_adapter import get_vector_store
|
| 9 |
+
from backend.src.models.integration import UserIntegration # SaaS Logic ke liye
|
| 10 |
|
| 11 |
+
async def process_url(url: str, session_id: str, user_id: str, db: AsyncSession):
    """
    SaaS Skill: Scrapes a URL strictly into the USER'S personal Cloud Qdrant.

    Flow: verify the user's Qdrant integration, scrape the page in a worker
    thread, chunk it, stamp multi-tenant metadata, and upload to the user's
    own vector database.

    Returns:
        int: number of chunks stored; 0 on failure or empty page;
             -1 when the user has no active Qdrant integration connected.
    """
    print(f"INFO: [Ingestion] Verifying Database for User {user_id} before scraping: {url}")

    try:
        # 1. FIRST: database verification (no key, no scrape).
        query = select(UserIntegration).where(
            UserIntegration.user_id == str(user_id),
            UserIntegration.provider == "qdrant",
            UserIntegration.is_active == True
        )
        integration = (await db.execute(query)).scalars().first()

        if integration is None:
            print(f"❌ ERROR: User {user_id} has no Qdrant connected.")
            return -1  # 'No Database' code for the API layer to handle

        # 2. Extract the user's secret credentials (stored as JSON string or dict).
        raw_creds = integration.credentials
        creds = json.loads(raw_creds) if isinstance(raw_creds, str) else raw_creds

        # 3. Secure connection to the user's cloud (credentials passed through).
        vector_store = get_vector_store(credentials=creds)

        # 4. Load data from the URL off the event loop (WebBaseLoader is blocking).
        def load_data():
            loader = WebBaseLoader(url)
            return loader.load()

        # NOTE(review): this fetch + empty-page guard was reconstructed from a
        # gap in the diff view — confirm against the original file.
        docs = await asyncio.to_thread(load_data)

        if not docs or not docs[0].page_content:
            print(f"WARNING: [Ingestion] No content found at {url}")
            return 0

        print(f"INFO: [Ingestion] Scrape Success. Content Length: {len(docs[0].page_content)} chars.")

        # 5. Text splitting into overlapping chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        split_docs = splitter.split_documents(docs)

        # 6. Strict metadata for multi-tenancy.
        for chunk in split_docs:
            chunk.metadata["session_id"] = session_id
            # Required so chat retrieval only ever sees this user's own data.
            chunk.metadata["user_id"] = user_id
            chunk.metadata["source"] = url
            chunk.metadata["type"] = "web_scrape"

        # 7. Upload to the user's vector DB.
        await vector_store.aadd_documents(split_docs)
        print(f"SUCCESS: [Ingestion] {len(split_docs)} chunks synced to User's Cloud Database.")
        return len(split_docs)

    except Exception as e:
        print(f"ERROR: [Ingestion] Processing failed for {url}: {e}")
        return 0
|
backend/src/services/ingestion/zip_processor.py
CHANGED
|
@@ -2,44 +2,81 @@ import zipfile
|
|
| 2 |
import os
|
| 3 |
import shutil
|
| 4 |
import asyncio
|
|
|
|
| 5 |
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
|
|
|
| 6 |
from backend.src.models.ingestion import IngestionJob, JobStatus
|
|
|
|
| 7 |
from backend.src.services.ingestion.file_processor import process_file
|
| 8 |
from backend.src.services.vector_store.qdrant_adapter import get_vector_store
|
| 9 |
from qdrant_client.http import models
|
| 10 |
|
| 11 |
-
# --- CONFIGURATION ---
|
| 12 |
SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.docx', '.csv']
|
| 13 |
MAX_FILES_IN_ZIP = 500
|
| 14 |
|
| 15 |
class SmartZipProcessor:
|
| 16 |
-
|
|
|
|
| 17 |
self.job_id = job_id
|
| 18 |
self.zip_path = zip_path
|
| 19 |
self.session_id = session_id
|
| 20 |
self.db = db
|
| 21 |
-
self.
|
|
|
|
| 22 |
self.temp_dir = f"./temp_unzip_{job_id}"
|
| 23 |
self.report = []
|
| 24 |
|
| 25 |
async def log_status(self, status: str, processed=0, total=0, error=None):
|
| 26 |
-
"""Database mein job status update karta hai"""
|
| 27 |
try:
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
if job:
|
| 30 |
job.status = status
|
| 31 |
job.items_processed = processed
|
| 32 |
job.total_items = total
|
| 33 |
-
job.details = self.report
|
| 34 |
if error:
|
| 35 |
job.error_message = str(error)
|
| 36 |
await self.db.commit()
|
| 37 |
except Exception as e:
|
| 38 |
print(f"DB Log Error: {e}")
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
async def clean_existing_data(self):
|
| 41 |
-
"""
|
| 42 |
-
print(f"INFO: Cleaning old data for
|
| 43 |
try:
|
| 44 |
self.vector_store.client.delete(
|
| 45 |
collection_name=self.vector_store.collection_name,
|
|
@@ -49,69 +86,76 @@ class SmartZipProcessor:
|
|
| 49 |
models.FieldCondition(
|
| 50 |
key="metadata.session_id",
|
| 51 |
match=models.MatchValue(value=self.session_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
)
|
| 53 |
]
|
| 54 |
)
|
| 55 |
)
|
| 56 |
)
|
| 57 |
except Exception as e:
|
| 58 |
-
print(f"Warning: Clean data failed
|
| 59 |
|
| 60 |
def inspect_zip(self) -> list:
|
| 61 |
-
"""Zip ko bina extract kiye check karta hai"""
|
| 62 |
with zipfile.ZipFile(self.zip_path, 'r') as zf:
|
| 63 |
file_list = zf.infolist()
|
| 64 |
-
|
| 65 |
-
# Guardrail 1: File Count
|
| 66 |
if len(file_list) > MAX_FILES_IN_ZIP:
|
| 67 |
-
raise ValueError(f"Zip
|
| 68 |
-
|
| 69 |
-
# Sirf "Files" return karo, folders nahi
|
| 70 |
return [f for f in file_list if not f.is_dir()]
|
| 71 |
|
| 72 |
def extract_zip(self):
|
| 73 |
-
"""Zip ko temp folder mein extract karta hai"""
|
| 74 |
os.makedirs(self.temp_dir, exist_ok=True)
|
| 75 |
with zipfile.ZipFile(self.zip_path, 'r') as zf:
|
| 76 |
zf.extractall(self.temp_dir)
|
| 77 |
|
| 78 |
def cleanup(self):
|
| 79 |
-
"""Temp files/folders delete karta hai"""
|
| 80 |
if os.path.exists(self.temp_dir):
|
| 81 |
shutil.rmtree(self.temp_dir)
|
| 82 |
if os.path.exists(self.zip_path):
|
| 83 |
os.remove(self.zip_path)
|
| 84 |
|
| 85 |
async def start(self):
|
| 86 |
-
"""Main Processing Loop"""
|
| 87 |
try:
|
| 88 |
-
#
|
|
|
|
|
|
|
|
|
|
| 89 |
files_to_process = self.inspect_zip()
|
| 90 |
total_files = len(files_to_process)
|
| 91 |
await self.log_status(JobStatus.PROCESSING, total=total_files)
|
| 92 |
|
| 93 |
-
#
|
| 94 |
await self.clean_existing_data()
|
| 95 |
|
| 96 |
-
#
|
| 97 |
self.extract_zip()
|
| 98 |
|
| 99 |
-
#
|
| 100 |
processed_count = 0
|
| 101 |
for file_info in files_to_process:
|
| 102 |
file_path = os.path.join(self.temp_dir, file_info.filename)
|
| 103 |
|
| 104 |
-
# Guardrail 2: Supported Extension
|
| 105 |
ext = os.path.splitext(file_path)[1].lower()
|
| 106 |
if ext not in SUPPORTED_EXTENSIONS:
|
| 107 |
self.report.append({"file": file_info.filename, "status": "skipped", "reason": "unsupported_type"})
|
| 108 |
continue
|
| 109 |
|
| 110 |
-
# Process the file
|
| 111 |
try:
|
| 112 |
-
# process_file (jo humne pehle
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
self.report.append({"file": file_info.filename, "status": "success", "chunks": chunks_added})
|
| 116 |
else:
|
| 117 |
raise ValueError("No content extracted")
|
|
@@ -120,10 +164,10 @@ class SmartZipProcessor:
|
|
| 120 |
|
| 121 |
processed_count += 1
|
| 122 |
await self.log_status(JobStatus.PROCESSING, processed=processed_count, total=total_files)
|
| 123 |
-
await asyncio.sleep(0.
|
| 124 |
|
| 125 |
await self.log_status(JobStatus.COMPLETED, processed=processed_count, total=total_files)
|
| 126 |
-
print(f"SUCCESS: Zip
|
| 127 |
|
| 128 |
except Exception as e:
|
| 129 |
print(f"ERROR: Zip processing failed: {e}")
|
|
|
|
| 2 |
import os
|
| 3 |
import shutil
|
| 4 |
import asyncio
|
| 5 |
+
import json
|
| 6 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 7 |
+
from sqlalchemy.future import select
|
| 8 |
+
|
| 9 |
from backend.src.models.ingestion import IngestionJob, JobStatus
|
| 10 |
+
from backend.src.models.integration import UserIntegration # SaaS Logic
|
| 11 |
from backend.src.services.ingestion.file_processor import process_file
|
| 12 |
from backend.src.services.vector_store.qdrant_adapter import get_vector_store
|
| 13 |
from qdrant_client.http import models
|
| 14 |
|
|
|
|
| 15 |
SUPPORTED_EXTENSIONS = ['.pdf', '.txt', '.md', '.docx', '.csv']
|
| 16 |
MAX_FILES_IN_ZIP = 500
|
| 17 |
|
| 18 |
class SmartZipProcessor:
|
| 19 |
+
# 1. Init mein 'user_id' add kiya
|
| 20 |
+
def __init__(self, job_id: int, zip_path: str, session_id: str, db: AsyncSession, user_id: str):
|
| 21 |
self.job_id = job_id
|
| 22 |
self.zip_path = zip_path
|
| 23 |
self.session_id = session_id
|
| 24 |
self.db = db
|
| 25 |
+
self.user_id = user_id # Owner ID
|
| 26 |
+
self.vector_store = None # Verification ke baad initialize hoga
|
| 27 |
self.temp_dir = f"./temp_unzip_{job_id}"
|
| 28 |
self.report = []
|
| 29 |
|
| 30 |
async def log_status(self, status: str, processed=0, total=0, error=None):
|
|
|
|
| 31 |
try:
|
| 32 |
+
# SQL Alchemy 2.0 style query
|
| 33 |
+
result = await self.db.execute(select(IngestionJob).where(IngestionJob.id == self.job_id))
|
| 34 |
+
job = result.scalars().first()
|
| 35 |
if job:
|
| 36 |
job.status = status
|
| 37 |
job.items_processed = processed
|
| 38 |
job.total_items = total
|
| 39 |
+
job.details = self.report
|
| 40 |
if error:
|
| 41 |
job.error_message = str(error)
|
| 42 |
await self.db.commit()
|
| 43 |
except Exception as e:
|
| 44 |
print(f"DB Log Error: {e}")
|
| 45 |
|
| 46 |
+
# --- NEW: SaaS DATABASE VERIFICATION ---
|
| 47 |
+
async def verify_and_connect_db(self) -> bool:
|
| 48 |
+
"""
|
| 49 |
+
ZIP processing se pehle check karo ke user ka Qdrant Cloud connected hai ya nahi.
|
| 50 |
+
"""
|
| 51 |
+
print(f"🔍 Verifying Database for ZIP Processing. User ID: {self.user_id}")
|
| 52 |
+
try:
|
| 53 |
+
stmt = select(UserIntegration).where(
|
| 54 |
+
UserIntegration.user_id == str(self.user_id),
|
| 55 |
+
UserIntegration.provider == "qdrant",
|
| 56 |
+
UserIntegration.is_active == True
|
| 57 |
+
)
|
| 58 |
+
result = await self.db.execute(stmt)
|
| 59 |
+
integration = result.scalars().first()
|
| 60 |
+
|
| 61 |
+
if not integration:
|
| 62 |
+
error_msg = "❌ No Qdrant Cloud connected. Cannot process ZIP."
|
| 63 |
+
await self.log_status(JobStatus.FAILED, error=error_msg)
|
| 64 |
+
return False
|
| 65 |
+
|
| 66 |
+
# Extract Credentials
|
| 67 |
+
creds = json.loads(integration.credentials) if isinstance(integration.credentials, str) else integration.credentials
|
| 68 |
+
|
| 69 |
+
# Smart Adapter ko user ki chabiyan bhejien (No Fallback)
|
| 70 |
+
self.vector_store = get_vector_store(credentials=creds)
|
| 71 |
+
return True
|
| 72 |
+
|
| 73 |
+
except Exception as e:
|
| 74 |
+
print(f"❌ DB Verification Failed: {e}")
|
| 75 |
+
return False
|
| 76 |
+
|
| 77 |
async def clean_existing_data(self):
|
| 78 |
+
"""SaaS Logic: Sirf is session aur is user ka purana data delete karo"""
|
| 79 |
+
print(f"INFO: Cleaning old data for session: {self.session_id}")
|
| 80 |
try:
|
| 81 |
self.vector_store.client.delete(
|
| 82 |
collection_name=self.vector_store.collection_name,
|
|
|
|
| 86 |
models.FieldCondition(
|
| 87 |
key="metadata.session_id",
|
| 88 |
match=models.MatchValue(value=self.session_id)
|
| 89 |
+
),
|
| 90 |
+
# SECURITY: Ensure we only delete THIS user's data
|
| 91 |
+
models.FieldCondition(
|
| 92 |
+
key="metadata.user_id",
|
| 93 |
+
match=models.MatchValue(value=str(self.user_id))
|
| 94 |
)
|
| 95 |
]
|
| 96 |
)
|
| 97 |
)
|
| 98 |
)
|
| 99 |
except Exception as e:
|
| 100 |
+
print(f"Warning: Clean data failed: {e}")
|
| 101 |
|
| 102 |
def inspect_zip(self) -> list:
|
|
|
|
| 103 |
with zipfile.ZipFile(self.zip_path, 'r') as zf:
|
| 104 |
file_list = zf.infolist()
|
|
|
|
|
|
|
| 105 |
if len(file_list) > MAX_FILES_IN_ZIP:
|
| 106 |
+
raise ValueError(f"Zip too large: {len(file_list)} files.")
|
|
|
|
|
|
|
| 107 |
return [f for f in file_list if not f.is_dir()]
|
| 108 |
|
| 109 |
def extract_zip(self):
|
|
|
|
| 110 |
os.makedirs(self.temp_dir, exist_ok=True)
|
| 111 |
with zipfile.ZipFile(self.zip_path, 'r') as zf:
|
| 112 |
zf.extractall(self.temp_dir)
|
| 113 |
|
| 114 |
def cleanup(self):
|
|
|
|
| 115 |
if os.path.exists(self.temp_dir):
|
| 116 |
shutil.rmtree(self.temp_dir)
|
| 117 |
if os.path.exists(self.zip_path):
|
| 118 |
os.remove(self.zip_path)
|
| 119 |
|
| 120 |
async def start(self):
|
|
|
|
| 121 |
try:
|
| 122 |
+
# 1. PEHLA KAAM: Database check
|
| 123 |
+
db_ready = await self.verify_and_connect_db()
|
| 124 |
+
if not db_ready: return
|
| 125 |
+
|
| 126 |
files_to_process = self.inspect_zip()
|
| 127 |
total_files = len(files_to_process)
|
| 128 |
await self.log_status(JobStatus.PROCESSING, total=total_files)
|
| 129 |
|
| 130 |
+
# 2. Atomic Clean
|
| 131 |
await self.clean_existing_data()
|
| 132 |
|
| 133 |
+
# 3. Extract
|
| 134 |
self.extract_zip()
|
| 135 |
|
| 136 |
+
# 4. Loop through files
|
| 137 |
processed_count = 0
|
| 138 |
for file_info in files_to_process:
|
| 139 |
file_path = os.path.join(self.temp_dir, file_info.filename)
|
| 140 |
|
|
|
|
| 141 |
ext = os.path.splitext(file_path)[1].lower()
|
| 142 |
if ext not in SUPPORTED_EXTENSIONS:
|
| 143 |
self.report.append({"file": file_info.filename, "status": "skipped", "reason": "unsupported_type"})
|
| 144 |
continue
|
| 145 |
|
|
|
|
| 146 |
try:
|
| 147 |
+
# process_file (jo humne pehle update kiya tha) ko call karo
|
| 148 |
+
# Ab isko 'user_id' aur 'db' session bhi bhej rahe hain 🚀
|
| 149 |
+
chunks_added = await process_file(
|
| 150 |
+
file_path=file_path,
|
| 151 |
+
session_id=self.session_id,
|
| 152 |
+
user_id=self.user_id,
|
| 153 |
+
db=self.db
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
if chunks_added == -1: # No Database error from process_file
|
| 157 |
+
raise ValueError("Database connection lost or not configured.")
|
| 158 |
+
elif chunks_added > 0:
|
| 159 |
self.report.append({"file": file_info.filename, "status": "success", "chunks": chunks_added})
|
| 160 |
else:
|
| 161 |
raise ValueError("No content extracted")
|
|
|
|
| 164 |
|
| 165 |
processed_count += 1
|
| 166 |
await self.log_status(JobStatus.PROCESSING, processed=processed_count, total=total_files)
|
| 167 |
+
await asyncio.sleep(0.05)
|
| 168 |
|
| 169 |
await self.log_status(JobStatus.COMPLETED, processed=processed_count, total=total_files)
|
| 170 |
+
print(f"SUCCESS: Secure Zip ingestion complete.")
|
| 171 |
|
| 172 |
except Exception as e:
|
| 173 |
print(f"ERROR: Zip processing failed: {e}")
|
backend/src/services/vector_store/qdrant_adapter.py
CHANGED
|
@@ -1,78 +1,50 @@
|
|
| 1 |
-
|
| 2 |
-
import qdrant_client
|
| 3 |
from qdrant_client import QdrantClient
|
| 4 |
from qdrant_client.http import models
|
| 5 |
from langchain_qdrant import QdrantVectorStore
|
| 6 |
-
from backend.src.core.config import settings
|
| 7 |
from backend.src.services.embeddings.factory import get_embedding_model
|
| 8 |
-
from typing import
|
| 9 |
|
| 10 |
-
|
| 11 |
-
def get_vector_store(credentials: Optional[Dict[str, str]] = None):
|
| 12 |
"""
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
2. Agar nahi, to global settings use karega (Fallback/Admin).
|
| 16 |
"""
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
if credentials:
|
| 21 |
-
# User-specific Cloud settings
|
| 22 |
-
qdrant_url = credentials.get("url")
|
| 23 |
-
qdrant_api_key = credentials.get("api_key")
|
| 24 |
-
collection_name = credentials.get("collection_name", "user_default_collection")
|
| 25 |
-
else:
|
| 26 |
-
# Global fallback settings
|
| 27 |
-
qdrant_url = settings.QDRANT_URL
|
| 28 |
-
qdrant_api_key = settings.QDRANT_API_KEY
|
| 29 |
-
collection_name = settings.QDRANT_COLLECTION_NAME
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
|
|
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
client = QdrantClient(
|
| 38 |
-
url=qdrant_url,
|
| 39 |
-
api_key=qdrant_api_key,
|
| 40 |
-
)
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
try:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
print(f"
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
dummy_embedding = embedding_model.embed_query("test")
|
| 55 |
-
vector_size = len(dummy_embedding)
|
| 56 |
-
|
| 57 |
client.create_collection(
|
| 58 |
collection_name=collection_name,
|
| 59 |
-
vectors_config=models.VectorParams(
|
| 60 |
-
size=vector_size,
|
| 61 |
-
distance=models.Distance.COSINE
|
| 62 |
-
)
|
| 63 |
)
|
| 64 |
-
print(f"SUCCESS: Created collection '{collection_name}' with vector size {vector_size}.")
|
| 65 |
-
else:
|
| 66 |
-
# Koi aur error (e.g., connection refused)
|
| 67 |
-
raise ConnectionError(f"Failed to connect or access Qdrant: {e}")
|
| 68 |
-
|
| 69 |
-
# 3. Vector Store object bana kar return karein
|
| 70 |
-
vector_store = QdrantVectorStore(
|
| 71 |
-
client=client,
|
| 72 |
-
collection_name=collection_name,
|
| 73 |
-
embedding=embedding_model,
|
| 74 |
-
content_payload_key="page_content",
|
| 75 |
-
metadata_payload_key="metadata"
|
| 76 |
-
)
|
| 77 |
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# backend/src/services/vector_store/qdrant_adapter.py
|
|
|
|
| 2 |
from qdrant_client import QdrantClient
|
| 3 |
from qdrant_client.http import models
|
| 4 |
from langchain_qdrant import QdrantVectorStore
|
|
|
|
| 5 |
from backend.src.services.embeddings.factory import get_embedding_model
|
| 6 |
+
from typing import Dict
|
| 7 |
|
| 8 |
+
def get_vector_store(credentials: Dict[str, str]):
    """
    Strict SaaS Vector Store Connector.

    NO GLOBAL FALLBACK. The user MUST provide their own Cloud Qdrant
    credentials: {"url": ..., "api_key": ..., "collection_name": ...}.

    The target collection is created on first use, sized to the embedding
    model's output dimension.

    Returns:
        QdrantVectorStore bound to the user's cluster/collection.

    Raises:
        ValueError: when no credentials or no URL were supplied (shown
            directly to the user by the API layer).
        ConnectionError: when the cluster cannot be reached or prepared.
    """
    if not credentials or not credentials.get("url"):
        # Surfaced directly to the user in the API response.
        raise ValueError("Database Connection Missing: Please connect your Qdrant Cloud in 'User Settings' first.")

    qdrant_url = credentials.get("url")
    qdrant_api_key = credentials.get("api_key")
    collection_name = credentials.get("collection_name", "user_default_collection")

    # Cloud check: Qdrant Cloud endpoints must be reached over HTTPS.
    if "cloud.qdrant.io" in qdrant_url and not qdrant_url.startswith("https://"):
        qdrant_url = f"https://{qdrant_url}"

    print(f"📡 [VectorDB] Strictly connecting to User Database: {qdrant_url}")

    try:
        client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key, timeout=30)

        # Build the embedding model ONCE (previously instantiated twice:
        # once to size a new collection and once again for the store itself).
        embedding_model = get_embedding_model()

        # Ensure the collection exists; create it sized to the embedding dim.
        try:
            client.get_collection(collection_name=collection_name)
        except Exception:
            print(f"Creating new collection: {collection_name}")
            vector_size = len(embedding_model.embed_query("test"))
            client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
            )

        return QdrantVectorStore(
            client=client,
            collection_name=collection_name,
            embedding=embedding_model,
            content_payload_key="page_content",
            metadata_payload_key="metadata"
        )
    except Exception as e:
        # Chain the cause so the original Qdrant error survives in tracebacks.
        raise ConnectionError(f"Qdrant Connection Failed: {str(e)}") from e
|