Spaces:
Sleeping
Sleeping
Commit ·
a249293
1
Parent(s): 3fe718a
Deployment
Browse files- .env +5 -0
- App/__pycache__/main.cpython-312.pyc +0 -0
- App/__pycache__/utils.cpython-312.pyc +0 -0
- App/main.py +1061 -0
- App/utils.py +376 -0
- Dockerfile +12 -0
- requirements.txt +179 -0
.env
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SECURITY (review): live-looking PayPal API credentials are committed to the
# repository in this .env file. Rotate these keys and supply them via
# deployment secrets; do not track .env in version control.
PAYPAL_CLIENT_ID = "AQL2cqJbTd3yoVyOk7fGhwsvH0vfZKi8jHz0RlA40ZyZ0N8pJZZ_A7KuzxMr7w6oiKDGlz44EYVl05qs"
PAYPAL_CLIENT_ID_sn = "AawVJHHptCzgOKPLacd0VOhziveuy63nA7M35HT--9U5BgkWzixkQSI4vRUvPlMnVG8nGocn8muw6QAC"
PAYPAL_CLIENT_SECRET = "ENhmJb_su9sjwh5ti4yLkLwW3GR0SYjTVrvZHjecBwM_DPoVWJX1GUaT6Nb2TGj-eEUXBqREFSYQiIDt"
PAYPAL_CLIENT_SECRET_sn = "ECXLFuvC5aAgC5wDFhSM1CTXh4M0MIJ4UsV6CagqZlp5FY9cGY1qNql7-dmaylp4mG0WqC3M-wUsqvgM"
PAYPAL_ENVIRONMENT = "sandbox"
|
App/__pycache__/main.cpython-312.pyc
ADDED
|
Binary file (39.1 kB). View file
|
|
|
App/__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
App/main.py
ADDED
|
@@ -0,0 +1,1061 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
import base64
|
| 4 |
+
import bcrypt
|
| 5 |
+
from datetime import datetime, timedelta , timezone
|
| 6 |
+
import io
|
| 7 |
+
from typing import Optional , Literal
|
| 8 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException, Depends, Query as QueryParam , Request
|
| 9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
+
from pydantic import BaseModel, EmailStr, Field , HttpUrl
|
| 11 |
+
from jose import jwt
|
| 12 |
+
from motor.motor_asyncio import AsyncIOMotorClient
|
| 13 |
+
from bson.binary import Binary
|
| 14 |
+
from fastapi import status
|
| 15 |
+
# from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound
|
| 16 |
+
from App.utils import (
|
| 17 |
+
load_and_split_bytes,
|
| 18 |
+
add_documents_to_index,
|
| 19 |
+
get_llm,
|
| 20 |
+
get_existing_retriever,
|
| 21 |
+
get_collection_stats ,
|
| 22 |
+
build_chroma_index ,
|
| 23 |
+
create_rag_chain_with_history ,
|
| 24 |
+
count_tokens ,
|
| 25 |
+
search_youtube_video ,
|
| 26 |
+
cosine_similarity
|
| 27 |
+
)
|
| 28 |
+
from bson import ObjectId
|
| 29 |
+
import openai
|
| 30 |
+
import numpy as np
|
| 31 |
+
import logging
|
| 32 |
+
from langchain_openai import OpenAIEmbeddings
|
| 33 |
+
import os
|
| 34 |
+
from dotenv import load_dotenv
|
| 35 |
+
from paypalcheckoutsdk.core import PayPalHttpClient, SandboxEnvironment, LiveEnvironment
|
| 36 |
+
from paypalcheckoutsdk.orders import OrdersCreateRequest, OrdersCaptureRequest
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
load_dotenv()

# --- PayPal configuration -------------------------------------------------
env_name = os.getenv("PAYPAL_ENVIRONMENT", "sandbox")
# NOTE(review): the "_sn" credentials are used for BOTH sandbox and live
# environments; confirm that live mode should not use PAYPAL_CLIENT_ID /
# PAYPAL_CLIENT_SECRET instead.
creds = dict(
    client_id=os.getenv("PAYPAL_CLIENT_ID_sn"),
    client_secret=os.getenv("PAYPAL_CLIENT_SECRET_sn"),
)
environment = (
    LiveEnvironment(**creds)
    if env_name == "live"
    else SandboxEnvironment(**creds)
)
p_client = PayPalHttpClient(environment)

# --- JWT configuration ----------------------------------------------------
# SECURITY: read the signing secret from the environment. The literal default
# preserves the previous hard-coded behavior for backward compatibility but
# must not be relied on in production.
SECRET_KEY = os.getenv("JWT_SECRET_KEY", "hssjhdahsd")
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 30

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    # NOTE(review): "*" already permits every origin, so also listing
    # localhost:3000 is redundant; tighten this list for production.
    allow_origins=["*", 'http://localhost:3000'],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# --- MongoDB connection ---------------------------------------------------
# SECURITY: the fallback URI embeds account credentials; set MONGODB_URI in
# the environment and remove the hard-coded default.
connection_string = os.getenv(
    "MONGODB_URI",
    "mongodb+srv://ahmed0499280:haseeb.2003@cluster0.hzgrxp2.mongodb.net/"
    "?retryWrites=true&w=majority&appName=Cluster0"
)
client = AsyncIOMotorClient(connection_string)
db = client["Cluster0"]
users_collection = db["users"]
chatbot_history_collection = db["chatbothistory"]
documents_collection = db["documents"]        # uploaded .docx files (binary bytes)
chroma_db_collection = db["chroma_db_store"]  # persisted Chroma vector store
flash_cards = db['flash_cards']
videos_collection = db['videos']
orders_collection = db["orders"]
subscriptions_collection = db["subscriptions"]

# Default language setting; None means "fall back to the user's stored language"
language = None
|
| 87 |
+
|
| 88 |
+
# Pydantic models (request/response schemas for the endpoints below)

class CreateSubOrderSchema(BaseModel):
    # Request body for creating a PayPal subscription order.
    user_id: str
    plan: Literal["monthly", "yearly"]

class TrialResponse(BaseModel):
    # Trial-activation response: current status plus expiry timestamp.
    status: str
    expires: datetime

class OrderResponse(BaseModel):
    # Minimal PayPal order acknowledgement.
    order_id: str
    status: str

class SubscriptionRequest(BaseModel):
    user_id: str
    plan: str

class SearchRequest(BaseModel):
    query: str


class Video(BaseModel):
    # A stored video: its link plus the description used for similarity search.
    link: str
    description: str

class LanguageRequest(BaseModel):
    language: str

class QueryModel(BaseModel):
    # Payload for /query_rag.
    question: str
    user_id: str
    diacritics: bool  # whether the answer should include Arabic diacritics
    level: str        # difficulty/proficiency level passed to the RAG chain

class UserSignup(BaseModel):
    username: str = Field(..., min_length=3, max_length=50)
    email: EmailStr
    password: str = Field(..., min_length=8)
    language: str
    # NOTE(review): /signup ignores this and always stores False, so admin
    # rights cannot be self-granted; the field appears unused.
    is_admin: bool


class UserModel(BaseModel):
    # Full user record as returned by GET /users/{user_id}.
    id: str
    username: str
    email: str
    language: str
    is_admin: bool
    password: Optional[str] = None  # never populated by get_user


class UserResponse(BaseModel):
    # Public subset returned by /signup.
    id: str
    username: str
    email: str

class UserLogin(BaseModel):
    email: EmailStr
    password: str

class Token(BaseModel):
    # OAuth2-style bearer token response from /login.
    access_token: str
    token_type: str

class DocumentInfo(BaseModel):
    # Metadata for an uploaded RAG document (binary content stored separately).
    filename: str
    upload_date: datetime
    file_size: int
    chunks: int
    user_id: str

class FlashcardSaveModel(BaseModel):
    # A question/answer pair to persist as a flashcard.
    user_id: str
    question: str
    answer: str
|
| 163 |
+
|
| 164 |
+
# Helper for login
async def authenticate_user(email: str, password: str):
    """Look up a user by email and verify the bcrypt-hashed password.

    Returns the user document on success, or None when the email is
    unknown or the password does not match.
    """
    record = await users_collection.find_one({"email": email})
    if record is None:
        return None
    if not bcrypt.checkpw(password.encode(), record["password"].encode()):
        return None
    return record
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
@app.get("/users/{user_id}", response_model=UserModel)
async def get_user(user_id: str):
    """
    Fetch a single user by their user_id.
    Supports lookup by MongoDB ObjectId or string _id.
    """
    # Prefer an ObjectId lookup; fall back to matching the raw string _id
    # (users created by /signup use UUID strings as _id).
    try:
        lookup = {"_id": ObjectId(user_id)}
    except Exception:
        lookup = {"_id": user_id}

    record = await users_collection.find_one(lookup)
    if record is None:
        raise HTTPException(status_code=404, detail="User not found")

    # Map Mongo's _id to the schema's `id`; password is intentionally omitted.
    return {
        "id": str(record.get("_id")),
        "username": record.get("username"),
        "email": record.get("email"),
        "language": record.get("language"),
        "is_admin": record.get("is_admin", False),
    }
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# Initialize retriever at startup
@app.on_event("startup")
async def startup_event():
    """On boot, try to restore a previously persisted vector store.

    Any failure is logged and leaves app.state.retriever as None; the
    store is (re)created when documents are uploaded via /build_rag.
    """
    try:
        restored = get_existing_retriever()
        if not restored:
            app.state.retriever = None
            print("No existing vector database found. Will create when documents are uploaded.")
            return
        app.state.retriever = restored
        stats = get_collection_stats()
        print(f"Loaded existing vector database with {stats['document_count']} document chunks")
    except Exception as e:
        print(f"Error loading vector database: {e}")
        app.state.retriever = None
|
| 216 |
+
|
| 217 |
+
### API Endpoints
|
| 218 |
+
|
| 219 |
+
# To get Language
@app.post("/language")
async def receive_language(req: LanguageRequest):
    """Set the preferred response language for /query_rag."""
    # NOTE(review): this mutates process-wide global state, so it changes
    # the language for ALL users, not just the caller; consider storing it
    # per user instead.
    global language
    language = req.language
    return {'Message': f"Language is Selected to '{language}'"}
|
| 225 |
+
|
| 226 |
+
# To SignUp
@app.post("/signup", response_model=UserResponse, tags=["auth"])
async def signup(user: UserSignup):
    """Register a new user; rejects duplicate emails with HTTP 400."""
    duplicate = await users_collection.find_one({"email": user.email})
    if duplicate:
        raise HTTPException(status_code=400, detail="Email already registered")

    hashed_pw = bcrypt.hashpw(user.password.encode(), bcrypt.gensalt()).decode()
    new_id = str(uuid.uuid4())

    # is_admin is always stored as False regardless of the request payload,
    # so admin privilege cannot be self-granted at signup.
    await users_collection.insert_one({
        "_id": new_id,
        "username": user.username,
        "email": user.email,
        "password": hashed_pw,
        "language": user.language,
        "is_admin": False,
    })
    print(user.language)
    return {"id": new_id, "username": user.username, "email": user.email}
|
| 244 |
+
|
| 245 |
+
# TO Login
@app.post("/login", response_model=Token, tags=["auth"])
async def login(credentials: UserLogin):
    """Authenticate and return a short-lived JWT bearer token.

    Raises 401 when the email is unknown or the password is wrong.
    """
    user = await authenticate_user(credentials.email, credentials.password)
    if not user:
        raise HTTPException(status_code=401, detail="Invalid email or password")
    # Fix: use an aware UTC timestamp for the exp claim; datetime.utcnow()
    # is deprecated and naive, which can skew expiry across timezones.
    expire = datetime.now(timezone.utc) + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    to_encode = {"sub": user["_id"], "exp": expire}
    token = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
    return {"access_token": token, "token_type": "bearer"}
|
| 255 |
+
|
| 256 |
+
# To Add Document to RAG - MongoDB storage for both document and Chroma
@app.post("/build_rag", tags=["rag"])
async def build_rag_endpoint(file: UploadFile = File(...), user_id: str = QueryParam()):
    """Ingest a .docx upload: store raw bytes in Mongo and index its chunks.

    Rejects non-.docx files with 400; any processing failure returns 500.
    """
    if not file.filename.endswith(".docx"):
        raise HTTPException(status_code=400, detail="Only .docx files are supported.")

    try:
        # Read the file into memory
        file_content = await file.read()
        file_size = len(file_content)

        # Generate a unique document ID
        doc_id = str(uuid.uuid4())

        # Process the document for RAG (split into chunks for indexing)
        temp_file = io.BytesIO(file_content)
        docs = load_and_split_bytes(temp_file)

        # Store document in MongoDB (metadata + original bytes)
        doc_info = {
            "_id": doc_id,
            "filename": file.filename,
            "upload_date": datetime.utcnow(),
            "file_size": file_size,
            "chunks": len(docs),
            "user_id": user_id,
            "file_content": Binary(file_content)  # Store as Binary BSON type
        }

        await documents_collection.insert_one(doc_info)

        # Add to or update the Chroma vector store and save to MongoDB;
        # also refreshes the in-memory retriever used by /query_rag.
        collection_name = "default"
        app.state.retriever = add_documents_to_index(docs, collection_name=collection_name)

        # Get updated stats
        stats = get_collection_stats(collection_name)

        return {
            "message": f"File '{file.filename}' added to knowledge base.",
            "document_id": doc_id,
            "total_chunks_in_db": stats["document_count"]
        }

    except Exception as e:
        # NOTE(review): this broad handler converts every failure (including
        # programming errors) into a generic 500; it would also swallow any
        # HTTPException later raised inside the try body.
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
|
| 302 |
+
|
| 303 |
+
# To query RAG
@app.post("/query_rag", tags=["rag"])
async def query_rag_endpoint(payload: QueryModel):
    """Answer a user question via RAG, or serve a stored video instead.

    Flow: (1) require an active, unexpired subscription; (2) ask the LLM to
    classify intent as play_video/other; (3a) video intent: embed stored
    video descriptions, return the best cosine match above 0.6; (3b)
    otherwise run the history-aware RAG chain and persist the exchange.
    """
    now = datetime.now(timezone.utc)

    subscription = await subscriptions_collection.find_one({
        "user_id": payload.user_id,
        "status": "active"
    })

    if not subscription:
        raise HTTPException(status_code=403, detail="No active subscription found.")

    end_date = subscription.get("end_date")
    # Mongo may hand back naive datetimes; normalise to UTC before comparing.
    if end_date and end_date.tzinfo is None:
        end_date = end_date.replace(tzinfo=timezone.utc)
    if end_date and now > end_date:
        # Update status to expired
        await subscriptions_collection.update_one(
            {"_id": subscription["_id"]},
            {"$set": {"status": "expired"}}
        )
        raise HTTPException(status_code=403, detail="Subscription expired.")
    llm = get_llm()
    # Binary intent classifier: the model must answer exactly "play_video"
    # or "other"; anything else falls through to the RAG branch below.
    cl_promp =f'''You are a binary intent classifier. When given a user message, decide if the user is asking to play or watch a video—explicitly or implicitly.

Instructions:
1. Analyze the provided message (`{payload.question}`) to determine intent:
- If the user intends to start, resume, or watch a video (e.g. “play the video now,” “let me see that demo,” “could you launch the tutorial clip?”), choose:
play_video
- Otherwise, choose:
other
2. Output exactly one label (play_video or other), with no additional text, punctuation, or formatting.

Input:
{{message}}

Output:'''
    cl = llm.invoke(cl_promp).content
    print(cl)
    if cl == 'play_video':
        try:
            # 1) Fetch up to 1000 docs (just the link+description)
            docs = await videos_collection.find(
                {},
                {"link": 1, "description": 1, "_id": 0}
            ).to_list(length=1000)

            if not docs:
                raise HTTPException(status_code=404, detail="No videos found")

            # 2) Get descriptions as a list
            descriptions = [doc["description"] for doc in docs]

            # 3) Embed query using LangChain's OpenAIEmbeddings
            # NOTE(review): `embeddings_model` is not defined anywhere in this
            # module — this branch raises NameError at runtime unless it is
            # created elsewhere (e.g. embeddings_model = OpenAIEmbeddings()).
            query_embedding = embeddings_model.embed_query(payload.question)

            # 4) Embed all descriptions (this is still inefficient but uses LangChain)
            # NOTE(review): re-embeds every stored description on each request;
            # consider caching description embeddings in the videos collection.
            description_embeddings = embeddings_model.embed_documents(descriptions)

            # 5) Compute similarities
            similarities = [cosine_similarity(query_embedding, desc_emb)
                            for desc_emb in description_embeddings]

            # 6) Pick best match
            best_idx = int(np.argmax(similarities))
            best_doc = docs[best_idx]
            print(similarities[best_idx])
            if similarities[best_idx] > 0.6:
                return {
                    "answer" : "Here is video Based on your query",
                    "youtube": {
                        "embed_url": best_doc["link"],
                        "watch_url": ""
                    },
                    "transcript": ""
                }
            else:
                return {
                    "answer" : "No Video found" ,
                    "youtube": {
                        "embed_url": "",
                        "watch_url": ""
                    },
                    "transcript": ""
                }

        except Exception as e:
            # NOTE(review): this also catches the 404 HTTPException raised
            # above and re-surfaces it as a 500; add `except HTTPException:
            # raise` before this handler.
            logging.error(f"Error during video search: {str(e)}")
            raise HTTPException(status_code=500, detail="Error processing search request")



        # t_prompt = f'''You are a concise title generator. When given a user’s message, produce a short, keyword‑rich title that captures the core topic or intent—optimized for searching on YouTube.
        # Instructions:
        # 1. Read the provided message (`{payload.question}`).
        # 2. Extract its main subject or action.
        # 3. Craft a clear, descriptive title (5–8 words) suitable as a YouTube search query.
        # 4. Output **exactly** the title, with no extra text or punctuation.

        # Input:
        # {{message}}

        # Output:
        # '''
        # message_title = llm.invoke(t_prompt).content
        # print(message_title)
        # try:
        #     vid = search_youtube_video(message_title)
        #     try:
        #         transcript_data = YouTubeTranscriptApi.get_transcript(vid, languages=['en'])
        #         # transcript_data is a list of { "text": "...", "start": 12.34, "duration": 2.5 }
        #     except NoTranscriptFound:
        #         transcript_data = []
        # except Exception:
        #     raise HTTPException(status_code=404, detail="Could not find a matching YouTube video.")
        # embed_url = f"https://www.youtube.com/embed/{vid}"
        # watch_url = f"https://www.youtube.com/watch?v={vid}"
        # return {
        #     "answer": f"Sure—here’s what I found on YouTube for your request:",
        #     "youtube": {
        #         "embed_url": embed_url,
        #         "watch_url": watch_url
        #     },
        #     "transcript": transcript_data
        # }
    else:
        # Use app.state.retriever which is initialized at startup or updated when docs are added
        if not hasattr(app.state, "retriever") or app.state.retriever is None:
            # Check if vector store exists but hasn't been loaded
            retriever = get_existing_retriever()
            if retriever:
                app.state.retriever = retriever
            else:
                raise HTTPException(
                    status_code=400,
                    detail="No documents have been uploaded. Please upload documents using /build_rag first."
                )

        # Get user language preference (global override wins over the stored one)
        if language is None:
            user_id = payload.user_id
            user = await users_collection.find_one({"_id": user_id})
            if user:
                user_lan = user.get("language")
            # else:
            #     user_lan = "Arabic"  # Default to English
            # NOTE(review): if the user record is missing, user_lan is never
            # assigned and the chain call below raises UnboundLocalError.
        else:
            user_lan = language
        question_tokens = count_tokens(payload.question)
        print(f"Question tokens: {question_tokens}")
        retrieved_docs = app.state.retriever.get_relevant_documents(payload.question)
        context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
        context_tokens = count_tokens(context_text)
        # Pull recent chat history (last 10 exchanges) for the chain's memory.
        chat_history = await chatbot_history_collection.find_one({"userId": payload.user_id})
        previous_messages = []

        if chat_history and "messages" in chat_history:
            history_limit = 10
            recent_messages = chat_history["messages"][-history_limit:]

            for msg in recent_messages:
                previous_messages.append({"role": "human", "content": msg["user_message"]})
                previous_messages.append({"role": "assistant", "content": msg["ai_response"]})
        history_text = ""
        for msg in previous_messages:
            history_text += f"{msg['role']}: {msg['content']}\n"

        history_tokens = count_tokens(history_text)
        print(f"History tokens: {history_tokens}")

        # Create RAG chain and generate response
        llm = get_llm()
        print("Diacritics : " , payload.diacritics)
        print("Language : " ,user_lan)
        rag_chain = create_rag_chain_with_history(app.state.retriever, llm, user_lan,payload.level ,payload.diacritics, previous_messages)
        response = rag_chain.invoke({"input": payload.question})
        response_tokens = count_tokens(response['answer'])
        print(f"Response tokens: {response_tokens}")
        print(f"Total tokens: {question_tokens + history_tokens + context_tokens}")
        # Record chat history (upsert creates the user's history doc on first use)
        msg_record = {
            "user_message": payload.question,
            "ai_response": response['answer'],
            "timestamp": datetime.utcnow()
        }

        await chatbot_history_collection.update_one(
            {"userId": payload.user_id},
            {"$push": {"messages": msg_record}},
            upsert=True
        )

        return {
            "answer": response["answer"] ,
            "youtube": {
                "embed_url": "",
                "watch_url": ""
            },
            "transcript": "" }
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
@app.post("/save_flashcard", tags=["flashcards"])
async def save_flashcard(payload: FlashcardSaveModel):
    """
    Save a chat exchange as a flashcard with LLM-generated title.

    Asks the LLM for a short descriptive title; falls back to a question
    snippet if the title is empty or implausibly long. Returns the new
    flashcard id and title; 500 on any failure.
    """
    try:
        # Generate title using LLM
        # Get LLM instance
        llm = get_llm()

        # Create a prompt for title generation
        title_prompt = f"""
        Based on the following question and answer, generate a concise, descriptive title (15 words or less)
        that captures the main topic or key insight. Make it specific enough that someone can understand
        what information they'll find in this flashcard.

        Question: {payload.question}

        Answer: {payload.answer}

        Title:
        """

        # Generate title using LLM
        title_response = llm.invoke(title_prompt)

        # Clean up the response (remove any quotation marks, extra spaces, etc.)
        title = title_response.content.strip().strip('"\'').strip()

        # If LLM fails to generate a good title, fall back to question snippet
        if not title or len(title) > 100:
            title = payload.question[:50] + ("..." if len(payload.question) > 50 else "")

        # Create the flashcard document (UUID string as _id, consistent with users)
        flashcard_doc = {
            "_id": str(uuid.uuid4()),
            "user_id": payload.user_id,
            "title": title,
            "question": payload.question,
            "answer": payload.answer,
            "created_at": datetime.utcnow()
        }

        # Insert into flash_cards collection
        await flash_cards.insert_one(flashcard_doc)

        return {
            "success": True,
            "flashcard_id": flashcard_doc["_id"],
            "title": title,  # Return the generated title to the frontend
            "message": "Flashcard saved successfully"
        }
    except Exception as e:
        print(f"Error saving flashcard: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error saving flashcard: {str(e)}")
|
| 560 |
+
|
| 561 |
+
# Get Flash Cards
@app.get("/flashcards/{user_id}", tags=["flashcards"])
async def get_user_flashcards(user_id: str):
    """
    Retrieve all flashcards for a specific user.
    """
    try:
        results = []
        cursor = flash_cards.find({"user_id": user_id}).sort("created_at", -1)
        async for entry in cursor:
            # Stringify _id so the payload is JSON-serialisable
            entry["_id"] = str(entry["_id"])
            results.append(entry)
        return {"user_id": user_id, "flashcards": results}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error retrieving flashcards: {str(e)}")
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
# Delete FlashCards
@app.delete("/flashcards/{flashcard_id}", tags=["flashcards"])
async def delete_flashcard(flashcard_id: str, user_id: str = QueryParam()):
    """
    Delete a specific flashcard. Requires user_id to verify ownership.

    Raises 404 when no flashcard matched (wrong id or not owned by the
    caller) and 500 on unexpected database errors.
    """
    try:
        result = await flash_cards.delete_one({
            "_id": flashcard_id,
            "user_id": user_id
        })

        if result.deleted_count == 0:
            raise HTTPException(status_code=404, detail="Flashcard not found or not owned by this user")

        return {"success": True, "message": "Flashcard deleted successfully"}
    except HTTPException:
        # Bug fix: the broad handler below used to catch the 404 raised
        # above and re-surface it as a 500; re-raise HTTP errors untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error deleting flashcard: {str(e)}")
|
| 598 |
+
|
| 599 |
+
# For Video

@app.post("/add_video")
async def add_video(video: Video):
    """Store a video link with its searchable description; 500 if the insert fails."""
    inserted = await videos_collection.insert_one(
        {"link": video.link, "description": video.description}
    )
    if not inserted.inserted_id:
        raise HTTPException(500, "Failed to add video")
    return {"message": "Video added successfully", "id": str(inserted.inserted_id)}
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
# To Delete a document
@app.delete("/documents/{document_id}", tags=["rag"], status_code=status.HTTP_204_NO_CONTENT)
async def delete_document(document_id: str):
    """Delete a stored document and rebuild the vector index from the rest.

    Returns 204 on success, 404 when the id does not exist. Rebuilding
    re-chunks every remaining document, so cost grows with corpus size.
    """
    # 1. Delete the document metadata + file bytes
    result = await documents_collection.delete_one({"_id": document_id})
    if result.deleted_count == 0:
        raise HTTPException(status_code=404, detail="Document not found")

    # 2. Rebuild the RAG index from all remaining documents
    #    - Fetch all stored documents
    cursor = documents_collection.find({})
    all_docs = []
    async for doc in cursor:
        # load_and_split_bytes expects a file-like, so wrap bytes in BytesIO
        # (the import is re-executed per iteration; harmless but could be
        # hoisted to module level)
        from io import BytesIO
        chunks = load_and_split_bytes(BytesIO(doc["file_content"]))
        all_docs.extend(chunks)

    # 3. If there are any chunks left, rebuild; otherwise clear retriever
    if all_docs:
        # This will re-create your Chroma index and persist it to Mongo
        app.state.retriever = build_chroma_index(
            all_docs,
            collection_name="default"
        )
    else:
        # No documents → clear both in-memory and persisted vector store
        app.state.retriever = None
        await chroma_db_collection.delete_one({"_id": "default"})
|
| 642 |
+
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
# To download a document
@app.get("/documents", tags=["rag"])
async def list_all_documents():
    """Return metadata for every stored document; raw file bytes are excluded."""
    try:
        summaries = []
        async for record in documents_collection.find({}):
            # Expose only lightweight metadata, never the binary content.
            summaries.append({
                "id": str(record["_id"]),
                "filename": record.get("filename"),
                "upload_date": record.get("upload_date"),
                "file_size": record.get("file_size"),
                "chunks": record.get("chunks"),
            })
        return {"documents": summaries}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 667 |
+
|
| 668 |
+
# To get Chat History
@app.get("/chat-history/{user_id}")
async def get_chat_history(user_id: str):
    """
    Return every chatbot-history document for a user.

    Raises 404 when the user has no history, 500 on database errors.
    """
    try:
        cursor = chatbot_history_collection.find({"userId": user_id})
        chat_history = []
        async for doc in cursor:
            # Convert ObjectId to string so the payload is JSON-serializable
            doc["_id"] = str(doc["_id"])
            chat_history.append(doc)

        if not chat_history:
            raise HTTPException(status_code=404, detail="No chat history found for this user")

        return {"user_id": user_id, "chat_history": chat_history}
    except HTTPException:
        # Bug fix: the broad handler below previously converted the intended
        # 404 into a 500. Re-raise HTTP errors unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 685 |
+
|
| 686 |
+
|
| 687 |
+
# To list uploaded documents
@app.get("/documents/user/{user_id}", tags=["rag"])
async def list_documents(user_id: str):
    """
    List metadata for a single user's uploaded documents (binary content excluded).
    """
    try:
        cursor = documents_collection.find({"user_id": user_id})
        documents = []
        async for doc in cursor:
            # Use .get() (consistent with list_all_documents) so a legacy
            # record missing a field yields None instead of a KeyError → 500.
            doc_info = {
                "id": str(doc["_id"]),
                "filename": doc.get("filename"),
                "upload_date": doc.get("upload_date"),
                "file_size": doc.get("file_size"),
                "chunks": doc.get("chunks"),
            }
            documents.append(doc_info)

        return {"user_id": user_id, "documents": documents}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 707 |
+
|
| 708 |
+
# To get knowledge base info
@app.get("/knowledge-base-info", tags=["rag"])
async def get_knowledge_base_info():
    """Report whether the vector store holds documents and how many chunks."""
    collection_stats = get_collection_stats()
    return {
        "documents_loaded": collection_stats["exists"],
        "total_chunks": collection_stats["document_count"],
    }
|
| 716 |
+
|
| 717 |
+
# Shared embeddings client used by /search_video.
# NOTE(security): this OpenAI API key is hard-coded and committed to source
# control — rotate the key immediately and load it from an environment
# variable (e.g. os.getenv("OPENAI_API_KEY")) instead.
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002" , openai_api_key = "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")
|
| 718 |
+
|
| 719 |
+
@app.post("/search_video")
async def search_video(req: SearchRequest):
    """
    Return the stored video whose description is most semantically similar
    to the query, scored with OpenAI embeddings + cosine similarity.

    Raises 400 for an empty query, 404 when no videos exist, 500 otherwise.
    """
    if not req.query.strip():
        raise HTTPException(status_code=400, detail="Search query cannot be empty")

    try:
        # 1) Fetch up to 1000 docs (just the link+description)
        docs = await videos_collection.find(
            {},
            {"link": 1, "description": 1, "_id": 0}
        ).to_list(length=1000)

        if not docs:
            raise HTTPException(status_code=404, detail="No videos found")

        # 2) Get descriptions as a list
        descriptions = [doc["description"] for doc in docs]

        # 3) Embed query using LangChain's OpenAIEmbeddings
        query_embedding = embeddings_model.embed_query(req.query)

        # 4) Embed all descriptions on every request — inefficient for large
        #    catalogs; consider precomputing and storing these embeddings.
        description_embeddings = embeddings_model.embed_documents(descriptions)

        # 5) Compute similarities
        similarities = [cosine_similarity(query_embedding, desc_emb)
                        for desc_emb in description_embeddings]

        # 6) Pick best match
        best_idx = int(np.argmax(similarities))
        best_doc = docs[best_idx]

        return {
            "link": best_doc["link"],
            "description": best_doc["description"],
            "score": similarities[best_idx]
        }
    except HTTPException:
        # Bug fix: the broad handler below previously turned the intended
        # 404 into a 500. Let HTTP errors propagate unchanged.
        raise
    except Exception as e:
        logging.error(f"Error during video search: {str(e)}")
        raise HTTPException(status_code=500, detail="Error processing search request")
|
| 760 |
+
|
| 761 |
+
|
| 762 |
+
# @app.post("/start-trial/{user_id}", response_model=TrialResponse)
|
| 763 |
+
# async def start_trial(user_id: str):
|
| 764 |
+
# trial = await subscriptions_collection.find_one({
|
| 765 |
+
# "user_id": user_id,
|
| 766 |
+
# "plan": "trial"
|
| 767 |
+
# })
|
| 768 |
+
|
| 769 |
+
# if trial:
|
| 770 |
+
# # Optional: Mark expired trial as "expired" for clarity
|
| 771 |
+
# if trial["end_date"] < datetime.utcnow() and trial["status"] == "active":
|
| 772 |
+
# await subscriptions_collection.update_one(
|
| 773 |
+
# {"_id": trial["_id"]},
|
| 774 |
+
# {"$set": {"status": "expired", "updated_at": datetime.utcnow()}}
|
| 775 |
+
# )
|
| 776 |
+
# raise HTTPException(400, "You have already used your free trial. Please subscribe to continue.")
|
| 777 |
+
|
| 778 |
+
# now = datetime.utcnow()
|
| 779 |
+
# trial = {
|
| 780 |
+
# "user_id": user_id,
|
| 781 |
+
# "plan": "trial",
|
| 782 |
+
# "status": "active",
|
| 783 |
+
# "start_date": now,
|
| 784 |
+
# "end_date": now + timedelta(days=3),
|
| 785 |
+
# "created_at": now,
|
| 786 |
+
# "updated_at": now
|
| 787 |
+
# }
|
| 788 |
+
# await subscriptions_collection.insert_one(trial)
|
| 789 |
+
|
| 790 |
+
# return {"status": "trial started", "expires": trial["end_date"]}
|
| 791 |
+
|
| 792 |
+
async def has_valid_active_subscription(user_id: str) -> bool:
    """Return True when the user has an 'active' subscription that has not yet expired."""
    selector = {
        "user_id": user_id,
        "status": "active",
        "end_date": {"$gt": datetime.utcnow()},
    }
    return await subscriptions_collection.find_one(selector) is not None
|
| 800 |
+
|
| 801 |
+
async def expire_old_subscriptions(user_id: str):
    """Mark all of the user's already-expired 'active' subscriptions as 'expired'."""
    moment = datetime.utcnow()
    selector = {
        "user_id": user_id,
        "status": "active",
        "end_date": {"$lte": moment},  # already expired
    }
    await subscriptions_collection.update_many(
        selector,
        {"$set": {"status": "expired", "updated_at": moment}},
    )
|
| 811 |
+
|
| 812 |
+
@app.post("/start-trial/{user_id}", response_model=TrialResponse)
async def start_trial(user_id: str):
    """Start a 3-day free trial unless the user already holds an active subscription."""
    await expire_old_subscriptions(user_id)

    if await has_valid_active_subscription(user_id):
        raise HTTPException(400, "You already have an active subscription.")

    started = datetime.utcnow()
    trial_doc = {
        "user_id": user_id,
        "plan": "trial",
        "status": "active",
        "start_date": started,
        "end_date": started + timedelta(days=3),
        "created_at": started,
        "updated_at": started,
    }
    await subscriptions_collection.insert_one(trial_doc)
    return {"status": "trial started", "expires": trial_doc["end_date"].isoformat()}
|
| 831 |
+
|
| 832 |
+
|
| 833 |
+
@app.post("/create-subscription-order")
async def create_sub_order(sub_req: SubscriptionRequest):
    """
    Create a PayPal CAPTURE order for a subscription plan, record it in
    MongoDB, and return the order id, status, and buyer approval URL.

    Raises 400 for an unknown plan.
    """
    import asyncio  # same local-import pattern as capture_order

    user_id = sub_req.user_id
    plan = sub_req.plan
    price_map = {
        "monthly": "10.00",
        "yearly": "100.00"
    }
    if plan not in price_map:
        raise HTTPException(400, "Invalid plan selected")
    price = price_map[plan]

    request = OrdersCreateRequest()
    request.prefer("return=representation")
    request.request_body({
        "intent": "CAPTURE",
        "purchase_units": [{
            "custom_id": user_id,  # internal identifier for later reconciliation
            "amount": {
                "currency_code": "USD",
                "value": price
            }
        }],
        "application_context": {
            "return_url": "http://localhost:3000/payments",  # optional
            "cancel_url": "http://localhost:3000/payments"
        }
    })

    # Bug fix: p_client.execute is synchronous and was called directly in an
    # async handler, blocking the event loop. Run it in a worker thread,
    # matching the approach already used in capture_order.
    response = await asyncio.to_thread(p_client.execute, request)
    result = response.result

    approve_url = next((link.href for link in result.links if link.rel == "approve"), None)
    now = datetime.utcnow()
    await orders_collection.insert_one({
        "paypal_order_id": result.id,
        "user_id": user_id,
        "plan": plan,
        "status": result.status,
        "created_at": now,
        "updated_at": now
    })

    return {
        "order_id": result.id,
        "status": result.status,
        "approve_url": approve_url
    }
|
| 880 |
+
|
| 881 |
+
|
| 882 |
+
@app.post("/capture-order/{order_id}", response_model=dict)
async def capture_order(order_id: str):
    """
    Capture an approved PayPal order and activate the matching subscription
    (30 days for the monthly plan, 365 days otherwise).

    NOTE(review): the duplicate-subscription check runs AFTER the PayPal
    capture succeeds, so a rejected request has already charged the buyer —
    confirm this ordering is intentional.
    """
    req = OrdersCaptureRequest(order_id)
    req.request_body({})

    # p_client.execute is synchronous — it must not be awaited directly;
    # run it in a worker thread so the event loop is not blocked.
    import asyncio

    resp = await asyncio.to_thread(p_client.execute, req)

    if resp.status_code != 201:
        raise HTTPException(400, "Failed to capture order")

    cap = resp.result
    order_doc = await orders_collection.find_one({"paypal_order_id": order_id})
    if not order_doc:
        raise HTTPException(404, "Order not found")

    # Expire old subscriptions before inserting new one
    await expire_old_subscriptions(order_doc["user_id"])

    if await has_valid_active_subscription(order_doc["user_id"]):
        raise HTTPException(400, "You already have an active subscription.")

    now = datetime.utcnow()
    days = 30 if order_doc["plan"] == "monthly" else 365
    sub = {
        "user_id": order_doc["user_id"],
        "plan": order_doc["plan"],
        "status": "active",
        "start_date": now,
        "end_date": now + timedelta(days=days),
        "paypal_order_id": order_id,
        "created_at": now,
        "updated_at": now
    }
    await subscriptions_collection.insert_one(sub)

    # Update order status
    await orders_collection.update_one(
        {"paypal_order_id": order_id},
        {"$set": {"status": cap.status}}
    )

    return {"status": cap.status}
|
| 929 |
+
|
| 930 |
+
|
| 931 |
+
# @app.post("/capture-order/{order_id}", response_model=dict)
|
| 932 |
+
# async def capture_order(order_id: str):
|
| 933 |
+
# req = OrdersCaptureRequest(order_id)
|
| 934 |
+
# req.request_body({})
|
| 935 |
+
|
| 936 |
+
# # ❌ Don't await this — it's synchronous
|
| 937 |
+
# # resp = p_client.execute(req)
|
| 938 |
+
# import asyncio
|
| 939 |
+
|
| 940 |
+
# resp = await asyncio.to_thread(p_client.execute, req)
|
| 941 |
+
|
| 942 |
+
# if resp.status_code != 201:
|
| 943 |
+
# raise HTTPException(400, "Failed to capture order")
|
| 944 |
+
|
| 945 |
+
# cap = resp.result
|
| 946 |
+
|
| 947 |
+
# order_doc = await orders_collection.find_one({"paypal_order_id": order_id})
|
| 948 |
+
# if not order_doc:
|
| 949 |
+
# raise HTTPException(404, "Order not found")
|
| 950 |
+
|
| 951 |
+
# await orders_collection.update_one(
|
| 952 |
+
# {"paypal_order_id": order_id},
|
| 953 |
+
# {"$set": {"status": cap.status}}
|
| 954 |
+
# )
|
| 955 |
+
|
| 956 |
+
# # Insert into subscriptions if plan exists
|
| 957 |
+
# if "plan" in order_doc:
|
| 958 |
+
# now = datetime.utcnow()
|
| 959 |
+
# days = 30 if order_doc["plan"] == "monthly" else 365
|
| 960 |
+
# sub = {
|
| 961 |
+
# "user_id": order_doc["user_id"],
|
| 962 |
+
# "plan": order_doc["plan"],
|
| 963 |
+
# "status": "active",
|
| 964 |
+
# "start_date": now,
|
| 965 |
+
# "end_date": now + timedelta(days=days),
|
| 966 |
+
# "paypal_order_id": order_id,
|
| 967 |
+
# "created_at": now,
|
| 968 |
+
# "updated_at": now
|
| 969 |
+
# }
|
| 970 |
+
# existing = await subscriptions_collection.find_one({"user_id": order_doc["user_id"]})
|
| 971 |
+
# if existing:
|
| 972 |
+
# # Optional: You can update the existing subscription instead of inserting
|
| 973 |
+
# await subscriptions_collection.update_one(
|
| 974 |
+
# {"user_id": order_doc["user_id"]},
|
| 975 |
+
# {"$set": {
|
| 976 |
+
# "plan": order_doc["plan"],
|
| 977 |
+
# "status": "active",
|
| 978 |
+
# "start_date": now,
|
| 979 |
+
# "end_date": now + timedelta(days=days),
|
| 980 |
+
# "paypal_order_id": order_id,
|
| 981 |
+
# "updated_at": now
|
| 982 |
+
# }}
|
| 983 |
+
# )
|
| 984 |
+
# else:
|
| 985 |
+
# await subscriptions_collection.insert_one(sub)
|
| 986 |
+
|
| 987 |
+
|
| 988 |
+
# return {"status": cap.status}
|
| 989 |
+
|
| 990 |
+
|
| 991 |
+
@app.post("/cancel", response_model=dict)
async def cancel_subscription(user_id: str):
    """Cancel the user's active subscription; 404 when no active one exists."""
    active_sub = await subscriptions_collection.find_one({
        "user_id": user_id,
        "status": "active"
    })
    if active_sub is None:
        raise HTTPException(status_code=404, detail="Active subscription not found")

    await subscriptions_collection.update_one(
        {"_id": active_sub["_id"]},
        {"$set": {"status": "cancelled", "updated_at": datetime.utcnow()}}
    )
    return {"status": "cancelled", "message": "Subscription cancelled successfully"}
|
| 1007 |
+
|
| 1008 |
+
|
| 1009 |
+
|
| 1010 |
+
|
| 1011 |
+
# @app.get("/payments")
|
| 1012 |
+
# async def success_page(user_id: str, token: str = "", PayerID: str = ""):
|
| 1013 |
+
# return {"message": "Payment approved", "user_id": user_id, "token": token, "PayerID": PayerID}
|
| 1014 |
+
|
| 1015 |
+
|
| 1016 |
+
@app.post("/webhook")
async def paypal_webhook(request: Request):
    """
    Handle PayPal webhook events; marks the local order COMPLETED when a
    PAYMENT.CAPTURE.COMPLETED event arrives.

    NOTE(security): the payload is not signature-verified, so a forged POST
    could mark orders completed — verify the transmission signature via
    PayPal's verify-webhook-signature API before trusting the event.
    """
    event = await request.json()
    if event.get("event_type") == "PAYMENT.CAPTURE.COMPLETED":
        # Defensive .get() chain: a payload missing "resource" would raise
        # KeyError → 500, causing PayPal to keep retrying the webhook.
        rid = (event.get("resource", {})
               .get("supplementary_data", {})
               .get("related_ids", {})
               .get("order_id"))
        if rid:
            await orders_collection.update_one(
                {"paypal_order_id": rid},
                {"$set": {"status": "COMPLETED"}}
            )
    return {"ok": True}
|
| 1030 |
+
|
| 1031 |
+
|
| 1032 |
+
|
| 1033 |
+
|
| 1034 |
+
|
| 1035 |
+
# To Health Check APP
@app.get("/health", tags=["health"])
async def health_check():
    """Report API, MongoDB, and vector-store health in one payload."""
    try:
        await client.admin.command("ping")
        db_status = "connected"
    except Exception as e:
        db_status = f"error: {str(e)}"

    # Probe the vector store independently so one failure does not mask the other.
    try:
        vector_stats = get_collection_stats()
        vector_db_status = "connected" if vector_stats["exists"] else "empty but ready"
    except Exception as e:
        vector_db_status = f"error: {str(e)}"

    return {
        "status": "healthy",
        "database": db_status,
        "vector_database": vector_db_status,
        "storage_type": "mongodb",
    }
|
| 1057 |
+
|
| 1058 |
+
|
| 1059 |
+
# if __name__ == "__main__":
|
| 1060 |
+
# import uvicorn
|
| 1061 |
+
# uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8001)))
|
App/utils.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import pickle
|
| 4 |
+
from typing import List, Optional, BinaryIO, Dict, Any
|
| 5 |
+
from langchain_community.document_loaders import Docx2txtLoader
|
| 6 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 8 |
+
from langchain_community.vectorstores import Chroma
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain.chains import create_retrieval_chain
|
| 11 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 12 |
+
from langchain.prompts import ChatPromptTemplate
|
| 13 |
+
from langchain.schema import Document
|
| 14 |
+
import tempfile
|
| 15 |
+
from pymongo import MongoClient
|
| 16 |
+
from bson.binary import Binary
|
| 17 |
+
import uuid
|
| 18 |
+
import tiktoken
|
| 19 |
+
from googleapiclient.discovery import build
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
# MongoDB connection
# NOTE(security): the fallback URI below embeds live database credentials in
# source control — rotate them and require MONGODB_URI from the environment
# with no hard-coded default.
MONGODB_URI = os.getenv(
    "MONGODB_URI",
    "mongodb+srv://ahmed0499280:haseeb.2003@cluster0.hzgrxp2.mongodb.net/"
    "?retryWrites=true&w=majority&appName=Cluster0"
)

# MongoDB client (synchronous PyMongo; used by the helpers in this module)
client = MongoClient(MONGODB_URI)
db = client["Cluster0"]
chroma_db_collection = db["chroma_db_store"]  # Collection for storing Chroma DB
# NOTE(security): hard-coded YouTube API key committed to source control —
# rotate it and load from an environment variable instead.
YOUTUBE_API_KEY = "AIzaSyDIaXWJJX2W8swWl093DMyNZ_7TZUGe3DI"
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
|
| 35 |
+
# Create a custom in-memory ChromaDB client
class MongoChromaStore:
    """Custom storage for Chroma that uses MongoDB instead of disk"""

    @staticmethod
    def save_chroma(chroma_db, collection_name="default"):
        """Save a Chroma DB to MongoDB"""
        try:
            # Pull the raw collection payload out of the Chroma instance.
            # This is a simplification — a full implementation would capture
            # more internal state than the .get() snapshot.
            payload = chroma_db._collection.get()

            record = {
                "_id": collection_name,
                "embeddings": Binary(pickle.dumps(payload)),
                "last_updated": pickle.dumps(payload["metadatas"] if "metadatas" in payload else []),
            }

            # Upsert so repeated saves overwrite the previous snapshot.
            chroma_db_collection.replace_one(
                {"_id": collection_name},
                record,
                upsert=True
            )
            return True
        except Exception as e:
            print(f"Error saving Chroma DB to MongoDB: {e}")
            return False
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def count_tokens(text, model_name="gpt-3.5-turbo"):
    """Count tokens for a text string."""
    try:
        enc = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Fall back to cl100k_base encoding if model not found
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
|
| 77 |
+
|
| 78 |
+
# Video search
def search_youtube_video(query: str) -> str:
    """
    Search YouTube for the top video matching `query` and return its videoId.
    """
    response = youtube.search().list(
        q=query,
        part="id,snippet",
        type="video",
        maxResults=1,
    ).execute()
    hits = response.get("items", [])
    if not hits:
        raise ValueError("No video found for query.")
    return hits[0]["id"]["videoId"]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Load and split DOCX into chunks (from file path)
def load_and_split(filepath: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
    """Load a .docx file and split its text into overlapping chunks."""
    documents = Docx2txtLoader(filepath).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)
|
| 105 |
+
|
| 106 |
+
# Load and split DOCX from bytes (for MongoDB storage)
def load_and_split_bytes(file_bytes: BinaryIO, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
    """Write raw bytes (or a file-like object) to a temp .docx and chunk it."""
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
        temp_path = tmp.name
        if hasattr(file_bytes, 'read'):
            # File-like object: rewind first, then copy its full contents.
            file_bytes.seek(0)
            tmp.write(file_bytes.read())
        else:
            # Already raw bytes — write directly.
            tmp.write(file_bytes)

    try:
        # Reuse the path-based loader on the temp file.
        return load_and_split(temp_path, chunk_size, chunk_overlap)
    finally:
        # Always remove the temp file, even if splitting fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)
|
| 128 |
+
|
| 129 |
+
# Build Chroma index and save to MongoDB
def build_chroma_index(docs, embedding_model: str = "Omartificial-Intelligence-Space/GATE-AraBert-v1", collection_name: str = "default"):
    """
    Build a Chroma vector index from `docs`, persist a snapshot of it to
    MongoDB via MongoChromaStore, and return a top-3 similarity retriever.
    """
    import shutil  # hoisted: was imported inside the finally block

    # Create temporary directory for Chroma
    temp_dir = tempfile.mkdtemp()

    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Create or update Chroma DB
        chroma_db = Chroma.from_documents(
            docs,
            embeddings,
            persist_directory=temp_dir,
            collection_name=collection_name
        )

        # Save Chroma DB to MongoDB
        MongoChromaStore.save_chroma(chroma_db, collection_name)

        # Return the retriever
        return chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    finally:
        # Clean up temporary directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
| 156 |
+
|
| 157 |
+
# Get existing Chroma DB from MongoDB
def get_existing_retriever(embedding_model: str = "Omartificial-Intelligence-Space/GATE-AraBert-v1", collection_name: str = "default"):
    """
    Rebuild a retriever from the Chroma snapshot stored in MongoDB.

    Returns a top-3 similarity retriever on success, or None when no snapshot
    exists or reconstruction fails.
    """
    # Check if collection exists in MongoDB
    chroma_data = chroma_db_collection.find_one({"_id": collection_name})
    if not chroma_data:
        return None

    try:
        # Create temporary directory for Chroma
        # NOTE(review): if mkdtemp() itself raises, the finally block below
        # references an unbound temp_dir — consider creating it before try.
        temp_dir = tempfile.mkdtemp()

        # Deserialize embeddings from MongoDB
        embeddings_data = pickle.loads(chroma_data["embeddings"])

        # Use the embeddings to recreate the Chroma DB
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # At this point we would need to reconstruct the Chroma DB
        # This is a simplified implementation that doesn't fully work
        # In a production system, you would need a more complete solution

        # For now, create a new Chroma DB and re-embed the stored documents
        # (re-embedding is wasteful but demonstrates the concept)
        if "documents" in embeddings_data and embeddings_data["documents"]:
            # Create documents from the stored data
            docs = []
            for i, text in enumerate(embeddings_data["documents"]):
                metadata = embeddings_data["metadatas"][i] if "metadatas" in embeddings_data else {}
                docs.append(Document(page_content=text, metadata=metadata))

            # Create a new Chroma DB
            chroma_db = Chroma.from_documents(
                docs,
                embeddings,
                persist_directory=temp_dir,
                collection_name=collection_name
            )

            return chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    except Exception as e:
        print(f"Error loading Chroma DB from MongoDB: {e}")
        return None
    finally:
        # Clean up temporary directory
        import shutil
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
| 205 |
+
|
| 206 |
+
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between vectors `a` and `b` (1.0 = identical direction)."""
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return float(numerator / denominator)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
# Get document count in the collection
def get_collection_stats(collection_name: str = "default"):
    """Return {'exists': bool, 'document_count': int} for a stored Chroma snapshot."""
    snapshot = chroma_db_collection.find_one({"_id": collection_name})
    if not snapshot:
        return {"exists": False, "document_count": 0}

    try:
        # Unpickle the stored payload and count its document entries.
        data = pickle.loads(snapshot["embeddings"])
        doc_count = len(data["documents"]) if "documents" in data else 0
        return {
            "exists": True,
            "document_count": doc_count
        }
    except Exception as e:
        print(f"Error getting collection stats: {e}")
        return {"exists": False, "document_count": 0}
|
| 231 |
+
|
| 232 |
+
# Instantiate LLM (Google Gemini or OpenAI)
def get_llm(temperature: float = 0.0):
    """Return the ChatOpenAI client used by the RAG chains.

    NOTE(review): the `temperature` parameter is accepted but never passed
    to ChatOpenAI — confirm whether this is intentional or a bug.
    NOTE(security): the API key is hard-coded and committed to source
    control — rotate it and load it from an environment variable.
    """
    return ChatOpenAI(model="o4-mini", api_key="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")
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def create_rag_chain_with_history(retriever, llm, lan, level, diacritics=False, history=None):
    """Build a retrieval-augmented QA chain whose system prompt embeds the
    conversation history, the student's level, and the response language.

    The previous revision duplicated the entire ~15-line system prompt in
    three branches that differed only in their final language/diacritics
    sentence; the shared text is now written once.

    Args:
        retriever: LangChain retriever supplying context snippets.
        llm: chat model used to answer.
        lan: response language (e.g. "Arabic", "English").
        level: student level ("beginner" / "intermediate" / "advanced").
        diacritics: for Arabic only -- whether answers must carry full
            diacritics. Any truthy value now enables diacritics (the old
            ``== True`` comparison silently ignored truthy non-bools).
        history: prior turns as ``[{"role": ..., "content": ...}]`` dicts.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    if history is None:
        history = []

    # Instructions shared by every language/diacritics combination
    # (byte-identical to the text previously repeated in all three branches).
    base_prompt = (
        "You are an Assistant for answering questions. "
        "Use the following retrieved context snippets to answer. "
        "Look for the relevance between the context and the question before answering. "
        "If you do not know the answer, say that you do not know. "
        "Be polite, act like a teacher, and provide as detailed an answer as possible based on the context. "
        "Consider the conversation history when responding. "
        "You are designed to help Muslims learn Arabic, so explanations should be culturally respectful and appropriate. "
        "Be responsive to the user's needs—if the user seems stuck or confused during the chat, proactively offer helpful suggestions, clarifications, or encouragement. "
        "Adjust your explanations according to the student's level: "
        "for 'beginner', use very simple language, break down grammar and context step-by-step, and give clear examples; "
        "for 'intermediate', provide more detailed grammar and usage insights with moderate complexity; "
        "for 'advanced', include deeper linguistic explanations, nuanced examples, and encourage self-reflection. "
        "Grammar and contextual explanations should start at the appropriate level and build gradually. "
        "Include examples from the connected knowledge base when possible; otherwise, generate clear and relevant examples yourself. "
    )

    # Only the language/diacritics tail differs between the variants.
    if lan.lower() == 'arabic' and diacritics:
        tail = (
            f"Always respond in {lan} *with all proper diacritics*. "
            f"Student level: {level}."
        )
    elif lan.lower() == 'arabic':
        # Fixed: the original concatenated "...*without diacritics*" directly
        # onto "Student level:" with no separator.
        tail = (
            f"Always respond in {lan} *without diacritics*. "
            f"Student level: {level}."
        )
    else:
        tail = (
            f"Always respond in {lan}. "
            f"Student level: {level}. "
        )

    # "{context}" is a LangChain template placeholder filled by the
    # stuff-documents chain, not an f-string interpolation.
    system_prompt = base_prompt + tail + "{context}"

    # System prompt first, then the prior turns, then the new user input.
    messages = [('system', system_prompt)]
    for message in history:
        messages.append((message["role"], message["content"]))
    messages.append(('human', '{input}'))

    prompt = ChatPromptTemplate.from_messages(messages)
    question_answer_chain = create_stuff_documents_chain(llm, prompt)

    return create_retrieval_chain(retriever, question_answer_chain)
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
# Additional function to add documents to existing index
def add_documents_to_index(docs, embedding_model: str = "Omartificial-Intelligence-Space/GATE-AraBert-v1", collection_name: str = "default"):
    """Append `docs` to an existing Chroma collection stored in MongoDB.

    Strategy: pull the previously pickled documents out of MongoDB, rebuild
    a fresh Chroma index from old + new documents in a temporary directory,
    persist the result back to MongoDB, and return a similarity retriever.
    Falls back to building a brand-new index when nothing exists yet.

    Args:
        docs: new LangChain ``Document`` objects to add.
        embedding_model: HuggingFace model name used for embeddings.
        collection_name: logical collection id (MongoDB ``_id``).

    Returns:
        A Chroma retriever (k=3 similarity search), or whatever
        ``build_chroma_index`` returns when no prior data exists.
    """
    # Get existing retriever
    existing_retriever = get_existing_retriever(embedding_model, collection_name)

    # If no existing retriever, create a new one
    if not existing_retriever:
        return build_chroma_index(docs, embedding_model, collection_name)

    # If we have an existing retriever, we need to add documents to it
    # This is a simplified implementation
    # In a production system, you would need a more complete solution

    # Create temporary directory for Chroma
    temp_dir = tempfile.mkdtemp()

    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Get existing documents
        chroma_data = chroma_db_collection.find_one({"_id": collection_name})
        if chroma_data:
            # NOTE(review): pickle.loads on DB blobs assumes the database
            # contents are fully trusted.
            embeddings_data = pickle.loads(chroma_data["embeddings"])

            # Create documents from the stored data
            existing_docs = []
            if "documents" in embeddings_data and embeddings_data["documents"]:
                for i, text in enumerate(embeddings_data["documents"]):
                    # NOTE(review): assumes "metadatas", when present, is
                    # parallel to "documents" (same length) — confirm against
                    # the writer in MongoChromaStore.save_chroma.
                    metadata = embeddings_data["metadatas"][i] if "metadatas" in embeddings_data else {}
                    existing_docs.append(Document(page_content=text, metadata=metadata))

            # Combine with new documents
            all_docs = existing_docs + docs

            # Create a new Chroma DB with all documents
            # (re-embeds everything from scratch — O(total docs) per call).
            chroma_db = Chroma.from_documents(
                all_docs,
                embeddings,
                persist_directory=temp_dir,
                collection_name=collection_name
            )

            # Save Chroma DB to MongoDB
            MongoChromaStore.save_chroma(chroma_db, collection_name)

            # NOTE(review): temp_dir is removed in the finally block below,
            # so the returned retriever relies on Chroma's in-memory state
            # rather than the (deleted) persist directory — verify this
            # matches how callers use the retriever.
            return chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
        else:
            # If no existing data, create a new index
            return build_chroma_index(docs, embedding_model, collection_name)

    finally:
        # Clean up temporary directory
        import shutil
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python image
FROM python:3.10
# Set the working directory
WORKDIR /code
# Install dependencies first so this layer is cached across code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the application package. The directory is "App" (capital A) in this
# repo; Linux filesystems are case-sensitive, so the old "COPY ./app ./app"
# could not produce the "App.main" module referenced by the CMD below.
COPY ./App ./App
# Expose port
EXPOSE 7860
# Run the FastAPI app using uvicorn
CMD ["uvicorn", "App.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiohappyeyeballs==2.6.1
|
| 2 |
+
aiohttp==3.11.18
|
| 3 |
+
aiosignal==1.3.2
|
| 4 |
+
annotated-types==0.7.0
|
| 5 |
+
anyio==4.9.0
|
| 6 |
+
asgiref==3.8.1
|
| 7 |
+
attrs==25.3.0
|
| 8 |
+
backoff==2.2.1
|
| 9 |
+
bcrypt==4.3.0
|
| 10 |
+
build==1.2.2.post1
|
| 11 |
+
cachetools==5.5.2
|
| 12 |
+
certifi==2025.4.26
|
| 13 |
+
cffi==1.17.1
|
| 14 |
+
charset-normalizer==3.4.2
|
| 15 |
+
chromadb==1.0.8
|
| 16 |
+
click==8.1.8
|
| 17 |
+
coloredlogs==15.0.1
|
| 18 |
+
cryptography==44.0.3
|
| 19 |
+
dataclasses-json==0.6.7
|
| 20 |
+
defusedxml==0.7.1
|
| 21 |
+
Deprecated==1.2.18
|
| 22 |
+
distro==1.9.0
|
| 23 |
+
dnspython==2.7.0
|
| 24 |
+
docx2txt==0.9
|
| 25 |
+
durationpy==0.9
|
| 26 |
+
ecdsa==0.19.1
|
| 27 |
+
email_validator==2.2.0
|
| 28 |
+
fastapi==0.115.9
|
| 29 |
+
filelock==3.18.0
|
| 30 |
+
filetype==1.2.0
|
| 31 |
+
flatbuffers==25.2.10
|
| 32 |
+
frozenlist==1.6.0
|
| 33 |
+
fsspec==2025.3.2
|
| 34 |
+
google-ai-generativelanguage==0.6.18
|
| 35 |
+
google-api-core==2.24.2
|
| 36 |
+
google-api-python-client==2.169.0
|
| 37 |
+
google-auth==2.40.1
|
| 38 |
+
google-auth-httplib2==0.2.0
|
| 39 |
+
googleapis-common-protos==1.70.0
|
| 40 |
+
greenlet==3.2.1
|
| 41 |
+
grpcio==1.71.0
|
| 42 |
+
grpcio-status==1.71.0
|
| 43 |
+
h11==0.16.0
|
| 44 |
+
httpcore==1.0.9
|
| 45 |
+
httplib2==0.22.0
|
| 46 |
+
httptools==0.6.4
|
| 47 |
+
httpx==0.28.1
|
| 48 |
+
httpx-sse==0.4.0
|
| 49 |
+
huggingface-hub==0.30.2
|
| 50 |
+
humanfriendly==10.0
|
| 51 |
+
idna==3.10
|
| 52 |
+
importlib_metadata==8.6.1
|
| 53 |
+
importlib_resources==6.5.2
|
| 54 |
+
Jinja2==3.1.6
|
| 55 |
+
jiter==0.9.0
|
| 56 |
+
joblib==1.5.0
|
| 57 |
+
jsonpatch==1.33
|
| 58 |
+
jsonpointer==3.0.0
|
| 59 |
+
jsonschema==4.23.0
|
| 60 |
+
jsonschema-specifications==2025.4.1
|
| 61 |
+
jwt==1.3.1
|
| 62 |
+
kubernetes==32.0.1
|
| 63 |
+
langchain==0.3.25
|
| 64 |
+
langchain-community==0.3.23
|
| 65 |
+
langchain-core==0.3.58
|
| 66 |
+
langchain-google-genai==2.1.4
|
| 67 |
+
langchain-huggingface==0.1.2
|
| 68 |
+
langchain-openai==0.3.16
|
| 69 |
+
langchain-text-splitters==0.3.8
|
| 70 |
+
langsmith==0.3.42
|
| 71 |
+
markdown-it-py==3.0.0
|
| 72 |
+
MarkupSafe==3.0.2
|
| 73 |
+
marshmallow==3.26.1
|
| 74 |
+
mdurl==0.1.2
|
| 75 |
+
mmh3==5.1.0
|
| 76 |
+
motor==3.7.0
|
| 77 |
+
mpmath==1.3.0
|
| 78 |
+
multidict==6.4.3
|
| 79 |
+
mypy_extensions==1.1.0
|
| 80 |
+
networkx==3.4.2
|
| 81 |
+
numpy==2.2.5
|
| 82 |
+
nvidia-cublas-cu12==12.6.4.1
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.6.80
|
| 84 |
+
nvidia-cuda-nvrtc-cu12==12.6.77
|
| 85 |
+
nvidia-cuda-runtime-cu12==12.6.77
|
| 86 |
+
nvidia-cudnn-cu12==9.5.1.17
|
| 87 |
+
nvidia-cufft-cu12==11.3.0.4
|
| 88 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 89 |
+
nvidia-curand-cu12==10.3.7.77
|
| 90 |
+
nvidia-cusolver-cu12==11.7.1.2
|
| 91 |
+
nvidia-cusparse-cu12==12.5.4.2
|
| 92 |
+
nvidia-cusparselt-cu12==0.6.3
|
| 93 |
+
nvidia-nccl-cu12==2.26.2
|
| 94 |
+
nvidia-nvjitlink-cu12==12.6.85
|
| 95 |
+
nvidia-nvtx-cu12==12.6.77
|
| 96 |
+
oauthlib==3.2.2
|
| 97 |
+
onnxruntime==1.21.1
|
| 98 |
+
openai==1.77.0
|
| 99 |
+
opentelemetry-api==1.32.1
|
| 100 |
+
opentelemetry-exporter-otlp-proto-common==1.32.1
|
| 101 |
+
opentelemetry-exporter-otlp-proto-grpc==1.32.1
|
| 102 |
+
opentelemetry-instrumentation==0.53b1
|
| 103 |
+
opentelemetry-instrumentation-asgi==0.53b1
|
| 104 |
+
opentelemetry-instrumentation-fastapi==0.53b1
|
| 105 |
+
opentelemetry-proto==1.32.1
|
| 106 |
+
opentelemetry-sdk==1.32.1
|
| 107 |
+
opentelemetry-semantic-conventions==0.53b1
|
| 108 |
+
opentelemetry-util-http==0.53b1
|
| 109 |
+
orjson==3.10.18
|
| 110 |
+
overrides==7.7.0
|
| 111 |
+
packaging==24.2
|
| 112 |
+
passlib==1.7.4
|
| 113 |
+
paypal-checkout-serversdk==1.0.3
|
| 114 |
+
paypalhttp==1.0.1
|
| 115 |
+
pillow==11.2.1
|
| 116 |
+
posthog==4.0.1
|
| 117 |
+
propcache==0.3.1
|
| 118 |
+
proto-plus==1.26.1
|
| 119 |
+
protobuf==5.29.4
|
| 120 |
+
pyasn1==0.6.1
|
| 121 |
+
pyasn1_modules==0.4.2
|
| 122 |
+
pycparser==2.22
|
| 123 |
+
pydantic==2.11.4
|
| 124 |
+
pydantic-settings==2.9.1
|
| 125 |
+
pydantic_core==2.33.2
|
| 126 |
+
Pygments==2.19.1
|
| 127 |
+
pymongo==4.12.1
|
| 128 |
+
pyOpenSSL==25.1.0
|
| 129 |
+
pyparsing==3.2.3
|
| 130 |
+
PyPika==0.48.9
|
| 131 |
+
pyproject_hooks==1.2.0
|
| 132 |
+
python-dateutil==2.9.0.post0
|
| 133 |
+
python-dotenv==1.1.0
|
| 134 |
+
python-jose==3.4.0
|
| 135 |
+
python-multipart==0.0.20
|
| 136 |
+
PyYAML==6.0.2
|
| 137 |
+
referencing==0.36.2
|
| 138 |
+
regex==2024.11.6
|
| 139 |
+
requests==2.32.3
|
| 140 |
+
requests-oauthlib==2.0.0
|
| 141 |
+
requests-toolbelt==1.0.0
|
| 142 |
+
rich==14.0.0
|
| 143 |
+
rpds-py==0.24.0
|
| 144 |
+
rsa==4.9.1
|
| 145 |
+
safetensors==0.5.3
|
| 146 |
+
scikit-learn==1.6.1
|
| 147 |
+
scipy==1.15.2
|
| 148 |
+
sentence-transformers==4.1.0
|
| 149 |
+
setuptools==80.3.1
|
| 150 |
+
shellingham==1.5.4
|
| 151 |
+
six==1.17.0
|
| 152 |
+
sniffio==1.3.1
|
| 153 |
+
SQLAlchemy==2.0.40
|
| 154 |
+
starlette==0.45.3
|
| 155 |
+
sympy==1.14.0
|
| 156 |
+
tenacity==9.1.2
|
| 157 |
+
threadpoolctl==3.6.0
|
| 158 |
+
tiktoken==0.9.0
|
| 159 |
+
tokenizers==0.21.1
|
| 160 |
+
torch==2.7.0
|
| 161 |
+
tqdm==4.67.1
|
| 162 |
+
transformers==4.51.3
|
| 163 |
+
triton==3.3.0
|
| 164 |
+
typer==0.15.3
|
| 165 |
+
typing-inspect==0.9.0
|
| 166 |
+
typing-inspection==0.4.0
|
| 167 |
+
typing_extensions==4.13.2
|
| 168 |
+
uritemplate==4.1.1
|
| 169 |
+
urllib3==2.4.0
|
| 170 |
+
uvicorn==0.34.2
|
| 171 |
+
uvloop==0.21.0
|
| 172 |
+
watchfiles==1.0.5
|
| 173 |
+
websocket-client==1.8.0
|
| 174 |
+
websockets==15.0.1
|
| 175 |
+
wrapt==1.17.2
|
| 176 |
+
yarl==1.20.0
|
| 177 |
+
youtube-transcript-api==1.0.3
|
| 178 |
+
zipp==3.21.0
|
| 179 |
+
zstandard==0.23.0
|