Spaces:
Sleeping
Sleeping
Commit ·
a249293
1
Parent(s): 3fe718a
Deployment
Browse files- .env +5 -0
- App/__pycache__/main.cpython-312.pyc +0 -0
- App/__pycache__/utils.cpython-312.pyc +0 -0
- App/main.py +1061 -0
- App/utils.py +376 -0
- Dockerfile +12 -0
- requirements.txt +179 -0
.env
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SECURITY (review): live-looking PayPal API credentials are committed to the
# repository in this .env file. Rotate these keys and supply them via
# deployment secrets; do not track .env in version control.
PAYPAL_CLIENT_ID = "AQL2cqJbTd3yoVyOk7fGhwsvH0vfZKi8jHz0RlA40ZyZ0N8pJZZ_A7KuzxMr7w6oiKDGlz44EYVl05qs"
PAYPAL_CLIENT_ID_sn = "AawVJHHptCzgOKPLacd0VOhziveuy63nA7M35HT--9U5BgkWzixkQSI4vRUvPlMnVG8nGocn8muw6QAC"
PAYPAL_CLIENT_SECRET = "ENhmJb_su9sjwh5ti4yLkLwW3GR0SYjTVrvZHjecBwM_DPoVWJX1GUaT6Nb2TGj-eEUXBqREFSYQiIDt"
PAYPAL_CLIENT_SECRET_sn = "ECXLFuvC5aAgC5wDFhSM1CTXh4M0MIJ4UsV6CagqZlp5FY9cGY1qNql7-dmaylp4mG0WqC3M-wUsqvgM"
PAYPAL_ENVIRONMENT = "sandbox"
|
App/__pycache__/main.cpython-312.pyc
ADDED
|
Binary file (39.1 kB). View file
|
|
|
App/__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
App/main.py
ADDED
|
@@ -0,0 +1,1061 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
import base64
|
| 4 |
+
import bcrypt
|
| 5 |
+
from datetime import datetime, timedelta , timezone
|
| 6 |
+
import io
|
| 7 |
+
from typing import Optional , Literal
|
| 8 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException, Depends, Query as QueryParam , Request
|
| 9 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 10 |
+
from pydantic import BaseModel, EmailStr, Field , HttpUrl
|
| 11 |
+
from jose import jwt
|
| 12 |
+
from motor.motor_asyncio import AsyncIOMotorClient
|
| 13 |
+
from bson.binary import Binary
|
| 14 |
+
from fastapi import status
|
| 15 |
+
# from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound
|
| 16 |
+
from App.utils import (
|
| 17 |
+
load_and_split_bytes,
|
| 18 |
+
add_documents_to_index,
|
| 19 |
+
get_llm,
|
| 20 |
+
get_existing_retriever,
|
| 21 |
+
get_collection_stats ,
|
| 22 |
+
build_chroma_index ,
|
| 23 |
+
create_rag_chain_with_history ,
|
| 24 |
+
count_tokens ,
|
| 25 |
+
search_youtube_video ,
|
| 26 |
+
cosine_similarity
|
| 27 |
+
)
|
| 28 |
+
from bson import ObjectId
|
| 29 |
+
import openai
|
| 30 |
+
import numpy as np
|
| 31 |
+
import logging
|
| 32 |
+
from langchain_openai import OpenAIEmbeddings
|
| 33 |
+
import os
|
| 34 |
+
from dotenv import load_dotenv
|
| 35 |
+
from paypalcheckoutsdk.core import PayPalHttpClient, SandboxEnvironment, LiveEnvironment
|
| 36 |
+
from paypalcheckoutsdk.orders import OrdersCreateRequest, OrdersCaptureRequest
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
load_dotenv()

# --- PayPal configuration -------------------------------------------------
env_name = os.getenv("PAYPAL_ENVIRONMENT", "sandbox")
# NOTE(review): the "_sn" credentials are used for BOTH sandbox and live
# environments; confirm that live mode should not use PAYPAL_CLIENT_ID /
# PAYPAL_CLIENT_SECRET instead.
creds = dict(
    client_id=os.getenv("PAYPAL_CLIENT_ID_sn"),
    client_secret=os.getenv("PAYPAL_CLIENT_SECRET_sn"),
)
environment = (
    LiveEnvironment(**creds)
    if env_name == "live"
    else SandboxEnvironment(**creds)
)
p_client = PayPalHttpClient(environment)

# --- JWT configuration ----------------------------------------------------
# SECURITY: read the signing secret from the environment. The literal default
# preserves the previous hard-coded behavior for backward compatibility but
# must not be relied on in production.
SECRET_KEY = os.getenv("JWT_SECRET_KEY", "hssjhdahsd")
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 30

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    # NOTE(review): "*" already permits every origin, so also listing
    # localhost:3000 is redundant; tighten this list for production.
    allow_origins=["*", 'http://localhost:3000'],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# --- MongoDB connection ---------------------------------------------------
# SECURITY: the fallback URI embeds account credentials; set MONGODB_URI in
# the environment and remove the hard-coded default.
connection_string = os.getenv(
    "MONGODB_URI",
    "mongodb+srv://ahmed0499280:haseeb.2003@cluster0.hzgrxp2.mongodb.net/"
    "?retryWrites=true&w=majority&appName=Cluster0"
)
client = AsyncIOMotorClient(connection_string)
db = client["Cluster0"]
users_collection = db["users"]
chatbot_history_collection = db["chatbothistory"]
documents_collection = db["documents"]        # uploaded .docx files (binary bytes)
chroma_db_collection = db["chroma_db_store"]  # persisted Chroma vector store
flash_cards = db['flash_cards']
videos_collection = db['videos']
orders_collection = db["orders"]
subscriptions_collection = db["subscriptions"]

# Default language setting; None means "fall back to the user's stored language"
language = None
|
| 87 |
+
|
| 88 |
+
# Pydantic models (request/response schemas for the endpoints below)

class CreateSubOrderSchema(BaseModel):
    # Request body for creating a PayPal subscription order.
    user_id: str
    plan: Literal["monthly", "yearly"]

class TrialResponse(BaseModel):
    # Trial-activation response: current status plus expiry timestamp.
    status: str
    expires: datetime

class OrderResponse(BaseModel):
    # Minimal PayPal order acknowledgement.
    order_id: str
    status: str

class SubscriptionRequest(BaseModel):
    user_id: str
    plan: str

class SearchRequest(BaseModel):
    query: str


class Video(BaseModel):
    # A stored video: its link plus the description used for similarity search.
    link: str
    description: str

class LanguageRequest(BaseModel):
    language: str

class QueryModel(BaseModel):
    # Payload for /query_rag.
    question: str
    user_id: str
    diacritics: bool  # whether the answer should include Arabic diacritics
    level: str        # difficulty/proficiency level passed to the RAG chain

class UserSignup(BaseModel):
    username: str = Field(..., min_length=3, max_length=50)
    email: EmailStr
    password: str = Field(..., min_length=8)
    language: str
    # NOTE(review): /signup ignores this and always stores False, so admin
    # rights cannot be self-granted; the field appears unused.
    is_admin: bool


class UserModel(BaseModel):
    # Full user record as returned by GET /users/{user_id}.
    id: str
    username: str
    email: str
    language: str
    is_admin: bool
    password: Optional[str] = None  # never populated by get_user


class UserResponse(BaseModel):
    # Public subset returned by /signup.
    id: str
    username: str
    email: str

class UserLogin(BaseModel):
    email: EmailStr
    password: str

class Token(BaseModel):
    # OAuth2-style bearer token response from /login.
    access_token: str
    token_type: str

class DocumentInfo(BaseModel):
    # Metadata for an uploaded RAG document (binary content stored separately).
    filename: str
    upload_date: datetime
    file_size: int
    chunks: int
    user_id: str

class FlashcardSaveModel(BaseModel):
    # A question/answer pair to persist as a flashcard.
    user_id: str
    question: str
    answer: str
|
| 163 |
+
|
| 164 |
+
# Helper for login
async def authenticate_user(email: str, password: str):
    """Look up a user by email and verify the bcrypt-hashed password.

    Returns the user document on success, or None when the email is
    unknown or the password does not match.
    """
    record = await users_collection.find_one({"email": email})
    if record is None:
        return None
    if not bcrypt.checkpw(password.encode(), record["password"].encode()):
        return None
    return record
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
@app.get("/users/{user_id}", response_model=UserModel)
async def get_user(user_id: str):
    """
    Fetch a single user by their user_id.
    Supports lookup by MongoDB ObjectId or string _id.
    """
    # Prefer an ObjectId lookup; fall back to matching the raw string _id
    # (users created by /signup use UUID strings as _id).
    try:
        lookup = {"_id": ObjectId(user_id)}
    except Exception:
        lookup = {"_id": user_id}

    record = await users_collection.find_one(lookup)
    if record is None:
        raise HTTPException(status_code=404, detail="User not found")

    # Map Mongo's _id to the schema's `id`; password is intentionally omitted.
    return {
        "id": str(record.get("_id")),
        "username": record.get("username"),
        "email": record.get("email"),
        "language": record.get("language"),
        "is_admin": record.get("is_admin", False),
    }
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# Initialize retriever at startup
@app.on_event("startup")
async def startup_event():
    """On boot, try to restore a previously persisted vector store.

    Any failure is logged and leaves app.state.retriever as None; the
    store is (re)created when documents are uploaded via /build_rag.
    """
    try:
        restored = get_existing_retriever()
        if not restored:
            app.state.retriever = None
            print("No existing vector database found. Will create when documents are uploaded.")
            return
        app.state.retriever = restored
        stats = get_collection_stats()
        print(f"Loaded existing vector database with {stats['document_count']} document chunks")
    except Exception as e:
        print(f"Error loading vector database: {e}")
        app.state.retriever = None
|
| 216 |
+
|
| 217 |
+
### API Endpoints
|
| 218 |
+
|
| 219 |
+
# To get Language
@app.post("/language")
async def receive_language(req: LanguageRequest):
    """Set the preferred response language for /query_rag."""
    # NOTE(review): this mutates process-wide global state, so it changes
    # the language for ALL users, not just the caller; consider storing it
    # per user instead.
    global language
    language = req.language
    return {'Message': f"Language is Selected to '{language}'"}
|
| 225 |
+
|
| 226 |
+
# To SignUp
@app.post("/signup", response_model=UserResponse, tags=["auth"])
async def signup(user: UserSignup):
    """Register a new user; rejects duplicate emails with HTTP 400."""
    duplicate = await users_collection.find_one({"email": user.email})
    if duplicate:
        raise HTTPException(status_code=400, detail="Email already registered")

    hashed_pw = bcrypt.hashpw(user.password.encode(), bcrypt.gensalt()).decode()
    new_id = str(uuid.uuid4())

    # is_admin is always stored as False regardless of the request payload,
    # so admin privilege cannot be self-granted at signup.
    await users_collection.insert_one({
        "_id": new_id,
        "username": user.username,
        "email": user.email,
        "password": hashed_pw,
        "language": user.language,
        "is_admin": False,
    })
    print(user.language)
    return {"id": new_id, "username": user.username, "email": user.email}
|
| 244 |
+
|
| 245 |
+
# TO Login
@app.post("/login", response_model=Token, tags=["auth"])
async def login(credentials: UserLogin):
    """Authenticate and return a short-lived JWT bearer token.

    Raises 401 when the email is unknown or the password is wrong.
    """
    user = await authenticate_user(credentials.email, credentials.password)
    if not user:
        raise HTTPException(status_code=401, detail="Invalid email or password")
    # Fix: use an aware UTC timestamp for the exp claim; datetime.utcnow()
    # is deprecated and naive, which can skew expiry across timezones.
    expire = datetime.now(timezone.utc) + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    to_encode = {"sub": user["_id"], "exp": expire}
    token = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
    return {"access_token": token, "token_type": "bearer"}
|
| 255 |
+
|
| 256 |
+
# To Add Document to RAG - MongoDB storage for both document and Chroma
@app.post("/build_rag", tags=["rag"])
async def build_rag_endpoint(file: UploadFile = File(...), user_id: str = QueryParam()):
    """Ingest a .docx upload: store raw bytes in Mongo and index its chunks.

    Rejects non-.docx files with 400; any processing failure returns 500.
    """
    if not file.filename.endswith(".docx"):
        raise HTTPException(status_code=400, detail="Only .docx files are supported.")

    try:
        # Read the file into memory
        file_content = await file.read()
        file_size = len(file_content)

        # Generate a unique document ID
        doc_id = str(uuid.uuid4())

        # Process the document for RAG (split into chunks for indexing)
        temp_file = io.BytesIO(file_content)
        docs = load_and_split_bytes(temp_file)

        # Store document in MongoDB (metadata + original bytes)
        doc_info = {
            "_id": doc_id,
            "filename": file.filename,
            "upload_date": datetime.utcnow(),
            "file_size": file_size,
            "chunks": len(docs),
            "user_id": user_id,
            "file_content": Binary(file_content)  # Store as Binary BSON type
        }

        await documents_collection.insert_one(doc_info)

        # Add to or update the Chroma vector store and save to MongoDB;
        # also refreshes the in-memory retriever used by /query_rag.
        collection_name = "default"
        app.state.retriever = add_documents_to_index(docs, collection_name=collection_name)

        # Get updated stats
        stats = get_collection_stats(collection_name)

        return {
            "message": f"File '{file.filename}' added to knowledge base.",
            "document_id": doc_id,
            "total_chunks_in_db": stats["document_count"]
        }

    except Exception as e:
        # NOTE(review): this broad handler converts every failure (including
        # programming errors) into a generic 500; it would also swallow any
        # HTTPException later raised inside the try body.
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
|
| 302 |
+
|
| 303 |
+
# To query RAG
@app.post("/query_rag", tags=["rag"])
async def query_rag_endpoint(payload: QueryModel):
    """Answer a user question via RAG, or serve a stored video instead.

    Flow: (1) require an active, unexpired subscription; (2) ask the LLM to
    classify intent as play_video/other; (3a) video intent: embed stored
    video descriptions, return the best cosine match above 0.6; (3b)
    otherwise run the history-aware RAG chain and persist the exchange.
    """
    now = datetime.now(timezone.utc)

    subscription = await subscriptions_collection.find_one({
        "user_id": payload.user_id,
        "status": "active"
    })

    if not subscription:
        raise HTTPException(status_code=403, detail="No active subscription found.")

    end_date = subscription.get("end_date")
    # Mongo may hand back naive datetimes; normalise to UTC before comparing.
    if end_date and end_date.tzinfo is None:
        end_date = end_date.replace(tzinfo=timezone.utc)
    if end_date and now > end_date:
        # Update status to expired
        await subscriptions_collection.update_one(
            {"_id": subscription["_id"]},
            {"$set": {"status": "expired"}}
        )
        raise HTTPException(status_code=403, detail="Subscription expired.")
    llm = get_llm()
    # Binary intent classifier: the model must answer exactly "play_video"
    # or "other"; anything else falls through to the RAG branch below.
    cl_promp =f'''You are a binary intent classifier. When given a user message, decide if the user is asking to play or watch a video—explicitly or implicitly.

Instructions:
1. Analyze the provided message (`{payload.question}`) to determine intent:
- If the user intends to start, resume, or watch a video (e.g. “play the video now,” “let me see that demo,” “could you launch the tutorial clip?”), choose:
play_video
- Otherwise, choose:
other
2. Output exactly one label (play_video or other), with no additional text, punctuation, or formatting.

Input:
{{message}}

Output:'''
    cl = llm.invoke(cl_promp).content
    print(cl)
    if cl == 'play_video':
        try:
            # 1) Fetch up to 1000 docs (just the link+description)
            docs = await videos_collection.find(
                {},
                {"link": 1, "description": 1, "_id": 0}
            ).to_list(length=1000)

            if not docs:
                raise HTTPException(status_code=404, detail="No videos found")

            # 2) Get descriptions as a list
            descriptions = [doc["description"] for doc in docs]

            # 3) Embed query using LangChain's OpenAIEmbeddings
            # NOTE(review): `embeddings_model` is not defined anywhere in this
            # module — this branch raises NameError at runtime unless it is
            # created elsewhere (e.g. embeddings_model = OpenAIEmbeddings()).
            query_embedding = embeddings_model.embed_query(payload.question)

            # 4) Embed all descriptions (this is still inefficient but uses LangChain)
            # NOTE(review): re-embeds every stored description on each request;
            # consider caching description embeddings in the videos collection.
            description_embeddings = embeddings_model.embed_documents(descriptions)

            # 5) Compute similarities
            similarities = [cosine_similarity(query_embedding, desc_emb)
                            for desc_emb in description_embeddings]

            # 6) Pick best match
            best_idx = int(np.argmax(similarities))
            best_doc = docs[best_idx]
            print(similarities[best_idx])
            if similarities[best_idx] > 0.6:
                return {
                    "answer" : "Here is video Based on your query",
                    "youtube": {
                        "embed_url": best_doc["link"],
                        "watch_url": ""
                    },
                    "transcript": ""
                }
            else:
                return {
                    "answer" : "No Video found" ,
                    "youtube": {
                        "embed_url": "",
                        "watch_url": ""
                    },
                    "transcript": ""
                }

        except Exception as e:
            # NOTE(review): this also catches the 404 HTTPException raised
            # above and re-surfaces it as a 500; add `except HTTPException:
            # raise` before this handler.
            logging.error(f"Error during video search: {str(e)}")
            raise HTTPException(status_code=500, detail="Error processing search request")



        # t_prompt = f'''You are a concise title generator. When given a user’s message, produce a short, keyword‑rich title that captures the core topic or intent—optimized for searching on YouTube.
        # Instructions:
        # 1. Read the provided message (`{payload.question}`).
        # 2. Extract its main subject or action.
        # 3. Craft a clear, descriptive title (5–8 words) suitable as a YouTube search query.
        # 4. Output **exactly** the title, with no extra text or punctuation.

        # Input:
        # {{message}}

        # Output:
        # '''
        # message_title = llm.invoke(t_prompt).content
        # print(message_title)
        # try:
        #     vid = search_youtube_video(message_title)
        #     try:
        #         transcript_data = YouTubeTranscriptApi.get_transcript(vid, languages=['en'])
        #         # transcript_data is a list of { "text": "...", "start": 12.34, "duration": 2.5 }
        #     except NoTranscriptFound:
        #         transcript_data = []
        # except Exception:
        #     raise HTTPException(status_code=404, detail="Could not find a matching YouTube video.")
        # embed_url = f"https://www.youtube.com/embed/{vid}"
        # watch_url = f"https://www.youtube.com/watch?v={vid}"
        # return {
        #     "answer": f"Sure—here’s what I found on YouTube for your request:",
        #     "youtube": {
        #         "embed_url": embed_url,
        #         "watch_url": watch_url
        #     },
        #     "transcript": transcript_data
        # }
    else:
        # Use app.state.retriever which is initialized at startup or updated when docs are added
        if not hasattr(app.state, "retriever") or app.state.retriever is None:
            # Check if vector store exists but hasn't been loaded
            retriever = get_existing_retriever()
            if retriever:
                app.state.retriever = retriever
            else:
                raise HTTPException(
                    status_code=400,
                    detail="No documents have been uploaded. Please upload documents using /build_rag first."
                )

        # Get user language preference (global override wins over the stored one)
        if language is None:
            user_id = payload.user_id
            user = await users_collection.find_one({"_id": user_id})
            if user:
                user_lan = user.get("language")
            # else:
            #     user_lan = "Arabic"  # Default to English
            # NOTE(review): if the user record is missing, user_lan is never
            # assigned and the chain call below raises UnboundLocalError.
        else:
            user_lan = language
        question_tokens = count_tokens(payload.question)
        print(f"Question tokens: {question_tokens}")
        retrieved_docs = app.state.retriever.get_relevant_documents(payload.question)
        context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
        context_tokens = count_tokens(context_text)
        # Pull recent chat history (last 10 exchanges) for the chain's memory.
        chat_history = await chatbot_history_collection.find_one({"userId": payload.user_id})
        previous_messages = []

        if chat_history and "messages" in chat_history:
            history_limit = 10
            recent_messages = chat_history["messages"][-history_limit:]

            for msg in recent_messages:
                previous_messages.append({"role": "human", "content": msg["user_message"]})
                previous_messages.append({"role": "assistant", "content": msg["ai_response"]})
        history_text = ""
        for msg in previous_messages:
            history_text += f"{msg['role']}: {msg['content']}\n"

        history_tokens = count_tokens(history_text)
        print(f"History tokens: {history_tokens}")

        # Create RAG chain and generate response
        llm = get_llm()
        print("Diacritics : " , payload.diacritics)
        print("Language : " ,user_lan)
        rag_chain = create_rag_chain_with_history(app.state.retriever, llm, user_lan,payload.level ,payload.diacritics, previous_messages)
        response = rag_chain.invoke({"input": payload.question})
        response_tokens = count_tokens(response['answer'])
        print(f"Response tokens: {response_tokens}")
        print(f"Total tokens: {question_tokens + history_tokens + context_tokens}")
        # Record chat history (upsert creates the user's history doc on first use)
        msg_record = {
            "user_message": payload.question,
            "ai_response": response['answer'],
            "timestamp": datetime.utcnow()
        }

        await chatbot_history_collection.update_one(
            {"userId": payload.user_id},
            {"$push": {"messages": msg_record}},
            upsert=True
        )

        return {
            "answer": response["answer"] ,
            "youtube": {
                "embed_url": "",
                "watch_url": ""
            },
            "transcript": "" }
|
| 503 |
+
|
| 504 |
+
|
| 505 |
+
@app.post("/save_flashcard", tags=["flashcards"])
async def save_flashcard(payload: FlashcardSaveModel):
    """
    Save a chat exchange as a flashcard with LLM-generated title.

    Asks the LLM for a short descriptive title; falls back to a question
    snippet if the title is empty or implausibly long. Returns the new
    flashcard id and title; 500 on any failure.
    """
    try:
        # Generate title using LLM
        # Get LLM instance
        llm = get_llm()

        # Create a prompt for title generation
        title_prompt = f"""
        Based on the following question and answer, generate a concise, descriptive title (15 words or less)
        that captures the main topic or key insight. Make it specific enough that someone can understand
        what information they'll find in this flashcard.

        Question: {payload.question}

        Answer: {payload.answer}

        Title:
        """

        # Generate title using LLM
        title_response = llm.invoke(title_prompt)

        # Clean up the response (remove any quotation marks, extra spaces, etc.)
        title = title_response.content.strip().strip('"\'').strip()

        # If LLM fails to generate a good title, fall back to question snippet
        if not title or len(title) > 100:
            title = payload.question[:50] + ("..." if len(payload.question) > 50 else "")

        # Create the flashcard document (UUID string as _id, consistent with users)
        flashcard_doc = {
            "_id": str(uuid.uuid4()),
            "user_id": payload.user_id,
            "title": title,
            "question": payload.question,
            "answer": payload.answer,
            "created_at": datetime.utcnow()
        }

        # Insert into flash_cards collection
        await flash_cards.insert_one(flashcard_doc)

        return {
            "success": True,
            "flashcard_id": flashcard_doc["_id"],
            "title": title,  # Return the generated title to the frontend
            "message": "Flashcard saved successfully"
        }
    except Exception as e:
        print(f"Error saving flashcard: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error saving flashcard: {str(e)}")
|
| 560 |
+
|
| 561 |
+
# Get Flash Cards
@app.get("/flashcards/{user_id}", tags=["flashcards"])
async def get_user_flashcards(user_id: str):
    """
    Retrieve all flashcards for a specific user.
    """
    try:
        results = []
        cursor = flash_cards.find({"user_id": user_id}).sort("created_at", -1)
        async for entry in cursor:
            # Stringify _id so the payload is JSON-serialisable
            entry["_id"] = str(entry["_id"])
            results.append(entry)
        return {"user_id": user_id, "flashcards": results}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error retrieving flashcards: {str(e)}")
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
# Delete FlashCards
@app.delete("/flashcards/{flashcard_id}", tags=["flashcards"])
async def delete_flashcard(flashcard_id: str, user_id: str = QueryParam()):
    """
    Delete a specific flashcard. Requires user_id to verify ownership.

    Raises 404 when no flashcard matched (wrong id or not owned by the
    caller) and 500 on unexpected database errors.
    """
    try:
        result = await flash_cards.delete_one({
            "_id": flashcard_id,
            "user_id": user_id
        })

        if result.deleted_count == 0:
            raise HTTPException(status_code=404, detail="Flashcard not found or not owned by this user")

        return {"success": True, "message": "Flashcard deleted successfully"}
    except HTTPException:
        # Bug fix: the broad handler below used to catch the 404 raised
        # above and re-surface it as a 500; re-raise HTTP errors untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error deleting flashcard: {str(e)}")
|
| 598 |
+
|
| 599 |
+
# For Video

@app.post("/add_video")
async def add_video(video: Video):
    """Store a video link with its searchable description; 500 if the insert fails."""
    inserted = await videos_collection.insert_one(
        {"link": video.link, "description": video.description}
    )
    if not inserted.inserted_id:
        raise HTTPException(500, "Failed to add video")
    return {"message": "Video added successfully", "id": str(inserted.inserted_id)}
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
# To Delete a document
@app.delete("/documents/{document_id}", tags=["rag"], status_code=status.HTTP_204_NO_CONTENT)
async def delete_document(document_id: str):
    """Delete a stored document and rebuild the vector index from the rest.

    Returns 204 on success, 404 when the id does not exist. Rebuilding
    re-chunks every remaining document, so cost grows with corpus size.
    """
    # 1. Delete the document metadata + file bytes
    result = await documents_collection.delete_one({"_id": document_id})
    if result.deleted_count == 0:
        raise HTTPException(status_code=404, detail="Document not found")

    # 2. Rebuild the RAG index from all remaining documents
    #    - Fetch all stored documents
    cursor = documents_collection.find({})
    all_docs = []
    async for doc in cursor:
        # load_and_split_bytes expects a file-like, so wrap bytes in BytesIO
        # (the import is re-executed per iteration; harmless but could be
        # hoisted to module level)
        from io import BytesIO
        chunks = load_and_split_bytes(BytesIO(doc["file_content"]))
        all_docs.extend(chunks)

    # 3. If there are any chunks left, rebuild; otherwise clear retriever
    if all_docs:
        # This will re-create your Chroma index and persist it to Mongo
        app.state.retriever = build_chroma_index(
            all_docs,
            collection_name="default"
        )
    else:
        # No documents → clear both in-memory and persisted vector store
        app.state.retriever = None
        await chroma_db_collection.delete_one({"_id": "default"})
|
| 642 |
+
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
# To download a document
@app.get("/documents", tags=["rag"])
async def list_all_documents():
    """Return metadata for every stored document; raw file bytes are excluded."""
    try:
        summaries = []
        async for record in documents_collection.find({}):
            # Expose only lightweight metadata, never the binary content.
            summaries.append({
                "id": str(record["_id"]),
                "filename": record.get("filename"),
                "upload_date": record.get("upload_date"),
                "file_size": record.get("file_size"),
                "chunks": record.get("chunks"),
            })
        return {"documents": summaries}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 667 |
+
|
| 668 |
+
# To get Chat History
@app.get("/chat-history/{user_id}")
async def get_chat_history(user_id: str):
    """
    Return every chatbot-history document for a user.

    Raises 404 when the user has no history, 500 on database errors.
    """
    try:
        cursor = chatbot_history_collection.find({"userId": user_id})
        chat_history = []
        async for doc in cursor:
            # Convert ObjectId to string so the payload is JSON-serializable
            doc["_id"] = str(doc["_id"])
            chat_history.append(doc)

        if not chat_history:
            raise HTTPException(status_code=404, detail="No chat history found for this user")

        return {"user_id": user_id, "chat_history": chat_history}
    except HTTPException:
        # Bug fix: the broad handler below previously converted the intended
        # 404 into a 500. Re-raise HTTP errors unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 685 |
+
|
| 686 |
+
|
| 687 |
+
# To list uploaded documents
@app.get("/documents/user/{user_id}", tags=["rag"])
async def list_documents(user_id: str):
    """
    List metadata for a single user's uploaded documents (binary content excluded).
    """
    try:
        cursor = documents_collection.find({"user_id": user_id})
        documents = []
        async for doc in cursor:
            # Use .get() (consistent with list_all_documents) so a legacy
            # record missing a field yields None instead of a KeyError → 500.
            doc_info = {
                "id": str(doc["_id"]),
                "filename": doc.get("filename"),
                "upload_date": doc.get("upload_date"),
                "file_size": doc.get("file_size"),
                "chunks": doc.get("chunks"),
            }
            documents.append(doc_info)

        return {"user_id": user_id, "documents": documents}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 707 |
+
|
| 708 |
+
# To get knowledge base info
@app.get("/knowledge-base-info", tags=["rag"])
async def get_knowledge_base_info():
    """Report whether the vector store holds documents and how many chunks."""
    collection_stats = get_collection_stats()
    return {
        "documents_loaded": collection_stats["exists"],
        "total_chunks": collection_stats["document_count"],
    }
|
| 716 |
+
|
| 717 |
+
# Shared embeddings client used by /search_video.
# NOTE(security): this OpenAI API key is hard-coded and committed to source
# control — rotate the key immediately and load it from an environment
# variable (e.g. os.getenv("OPENAI_API_KEY")) instead.
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002" , openai_api_key = "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")
|
| 718 |
+
|
| 719 |
+
@app.post("/search_video")
async def search_video(req: SearchRequest):
    """
    Return the stored video whose description is most semantically similar
    to the query, scored with OpenAI embeddings + cosine similarity.

    Raises 400 for an empty query, 404 when no videos exist, 500 otherwise.
    """
    if not req.query.strip():
        raise HTTPException(status_code=400, detail="Search query cannot be empty")

    try:
        # 1) Fetch up to 1000 docs (just the link+description)
        docs = await videos_collection.find(
            {},
            {"link": 1, "description": 1, "_id": 0}
        ).to_list(length=1000)

        if not docs:
            raise HTTPException(status_code=404, detail="No videos found")

        # 2) Get descriptions as a list
        descriptions = [doc["description"] for doc in docs]

        # 3) Embed query using LangChain's OpenAIEmbeddings
        query_embedding = embeddings_model.embed_query(req.query)

        # 4) Embed all descriptions on every request — inefficient for large
        #    catalogs; consider precomputing and storing these embeddings.
        description_embeddings = embeddings_model.embed_documents(descriptions)

        # 5) Compute similarities
        similarities = [cosine_similarity(query_embedding, desc_emb)
                        for desc_emb in description_embeddings]

        # 6) Pick best match
        best_idx = int(np.argmax(similarities))
        best_doc = docs[best_idx]

        return {
            "link": best_doc["link"],
            "description": best_doc["description"],
            "score": similarities[best_idx]
        }
    except HTTPException:
        # Bug fix: the broad handler below previously turned the intended
        # 404 into a 500. Let HTTP errors propagate unchanged.
        raise
    except Exception as e:
        logging.error(f"Error during video search: {str(e)}")
        raise HTTPException(status_code=500, detail="Error processing search request")
|
| 760 |
+
|
| 761 |
+
|
| 762 |
+
# @app.post("/start-trial/{user_id}", response_model=TrialResponse)
|
| 763 |
+
# async def start_trial(user_id: str):
|
| 764 |
+
# trial = await subscriptions_collection.find_one({
|
| 765 |
+
# "user_id": user_id,
|
| 766 |
+
# "plan": "trial"
|
| 767 |
+
# })
|
| 768 |
+
|
| 769 |
+
# if trial:
|
| 770 |
+
# # Optional: Mark expired trial as "expired" for clarity
|
| 771 |
+
# if trial["end_date"] < datetime.utcnow() and trial["status"] == "active":
|
| 772 |
+
# await subscriptions_collection.update_one(
|
| 773 |
+
# {"_id": trial["_id"]},
|
| 774 |
+
# {"$set": {"status": "expired", "updated_at": datetime.utcnow()}}
|
| 775 |
+
# )
|
| 776 |
+
# raise HTTPException(400, "You have already used your free trial. Please subscribe to continue.")
|
| 777 |
+
|
| 778 |
+
# now = datetime.utcnow()
|
| 779 |
+
# trial = {
|
| 780 |
+
# "user_id": user_id,
|
| 781 |
+
# "plan": "trial",
|
| 782 |
+
# "status": "active",
|
| 783 |
+
# "start_date": now,
|
| 784 |
+
# "end_date": now + timedelta(days=3),
|
| 785 |
+
# "created_at": now,
|
| 786 |
+
# "updated_at": now
|
| 787 |
+
# }
|
| 788 |
+
# await subscriptions_collection.insert_one(trial)
|
| 789 |
+
|
| 790 |
+
# return {"status": "trial started", "expires": trial["end_date"]}
|
| 791 |
+
|
| 792 |
+
async def has_valid_active_subscription(user_id: str) -> bool:
    """Return True when the user has an 'active' subscription that has not yet expired."""
    selector = {
        "user_id": user_id,
        "status": "active",
        "end_date": {"$gt": datetime.utcnow()},
    }
    return await subscriptions_collection.find_one(selector) is not None
|
| 800 |
+
|
| 801 |
+
async def expire_old_subscriptions(user_id: str):
    """Mark all of the user's already-expired 'active' subscriptions as 'expired'."""
    moment = datetime.utcnow()
    selector = {
        "user_id": user_id,
        "status": "active",
        "end_date": {"$lte": moment},  # already expired
    }
    await subscriptions_collection.update_many(
        selector,
        {"$set": {"status": "expired", "updated_at": moment}},
    )
|
| 811 |
+
|
| 812 |
+
@app.post("/start-trial/{user_id}", response_model=TrialResponse)
async def start_trial(user_id: str):
    """Start a 3-day free trial unless the user already holds an active subscription."""
    await expire_old_subscriptions(user_id)

    if await has_valid_active_subscription(user_id):
        raise HTTPException(400, "You already have an active subscription.")

    started = datetime.utcnow()
    trial_doc = {
        "user_id": user_id,
        "plan": "trial",
        "status": "active",
        "start_date": started,
        "end_date": started + timedelta(days=3),
        "created_at": started,
        "updated_at": started,
    }
    await subscriptions_collection.insert_one(trial_doc)
    return {"status": "trial started", "expires": trial_doc["end_date"].isoformat()}
|
| 831 |
+
|
| 832 |
+
|
| 833 |
+
@app.post("/create-subscription-order")
async def create_sub_order(sub_req: SubscriptionRequest):
    """
    Create a PayPal CAPTURE order for a subscription plan, record it in
    MongoDB, and return the order id, status, and buyer approval URL.

    Raises 400 for an unknown plan.
    """
    import asyncio  # same local-import pattern as capture_order

    user_id = sub_req.user_id
    plan = sub_req.plan
    price_map = {
        "monthly": "10.00",
        "yearly": "100.00"
    }
    if plan not in price_map:
        raise HTTPException(400, "Invalid plan selected")
    price = price_map[plan]

    request = OrdersCreateRequest()
    request.prefer("return=representation")
    request.request_body({
        "intent": "CAPTURE",
        "purchase_units": [{
            "custom_id": user_id,  # internal identifier for later reconciliation
            "amount": {
                "currency_code": "USD",
                "value": price
            }
        }],
        "application_context": {
            "return_url": "http://localhost:3000/payments",  # optional
            "cancel_url": "http://localhost:3000/payments"
        }
    })

    # Bug fix: p_client.execute is synchronous and was called directly in an
    # async handler, blocking the event loop. Run it in a worker thread,
    # matching the approach already used in capture_order.
    response = await asyncio.to_thread(p_client.execute, request)
    result = response.result

    approve_url = next((link.href for link in result.links if link.rel == "approve"), None)
    now = datetime.utcnow()
    await orders_collection.insert_one({
        "paypal_order_id": result.id,
        "user_id": user_id,
        "plan": plan,
        "status": result.status,
        "created_at": now,
        "updated_at": now
    })

    return {
        "order_id": result.id,
        "status": result.status,
        "approve_url": approve_url
    }
|
| 880 |
+
|
| 881 |
+
|
| 882 |
+
@app.post("/capture-order/{order_id}", response_model=dict)
async def capture_order(order_id: str):
    """
    Capture an approved PayPal order and activate the matching subscription
    (30 days for the monthly plan, 365 days otherwise).

    NOTE(review): the duplicate-subscription check runs AFTER the PayPal
    capture succeeds, so a rejected request has already charged the buyer —
    confirm this ordering is intentional.
    """
    req = OrdersCaptureRequest(order_id)
    req.request_body({})

    # p_client.execute is synchronous — it must not be awaited directly;
    # run it in a worker thread so the event loop is not blocked.
    import asyncio

    resp = await asyncio.to_thread(p_client.execute, req)

    if resp.status_code != 201:
        raise HTTPException(400, "Failed to capture order")

    cap = resp.result
    order_doc = await orders_collection.find_one({"paypal_order_id": order_id})
    if not order_doc:
        raise HTTPException(404, "Order not found")

    # Expire old subscriptions before inserting new one
    await expire_old_subscriptions(order_doc["user_id"])

    if await has_valid_active_subscription(order_doc["user_id"]):
        raise HTTPException(400, "You already have an active subscription.")

    now = datetime.utcnow()
    days = 30 if order_doc["plan"] == "monthly" else 365
    sub = {
        "user_id": order_doc["user_id"],
        "plan": order_doc["plan"],
        "status": "active",
        "start_date": now,
        "end_date": now + timedelta(days=days),
        "paypal_order_id": order_id,
        "created_at": now,
        "updated_at": now
    }
    await subscriptions_collection.insert_one(sub)

    # Update order status
    await orders_collection.update_one(
        {"paypal_order_id": order_id},
        {"$set": {"status": cap.status}}
    )

    return {"status": cap.status}
|
| 929 |
+
|
| 930 |
+
|
| 931 |
+
# @app.post("/capture-order/{order_id}", response_model=dict)
|
| 932 |
+
# async def capture_order(order_id: str):
|
| 933 |
+
# req = OrdersCaptureRequest(order_id)
|
| 934 |
+
# req.request_body({})
|
| 935 |
+
|
| 936 |
+
# # ❌ Don't await this — it's synchronous
|
| 937 |
+
# # resp = p_client.execute(req)
|
| 938 |
+
# import asyncio
|
| 939 |
+
|
| 940 |
+
# resp = await asyncio.to_thread(p_client.execute, req)
|
| 941 |
+
|
| 942 |
+
# if resp.status_code != 201:
|
| 943 |
+
# raise HTTPException(400, "Failed to capture order")
|
| 944 |
+
|
| 945 |
+
# cap = resp.result
|
| 946 |
+
|
| 947 |
+
# order_doc = await orders_collection.find_one({"paypal_order_id": order_id})
|
| 948 |
+
# if not order_doc:
|
| 949 |
+
# raise HTTPException(404, "Order not found")
|
| 950 |
+
|
| 951 |
+
# await orders_collection.update_one(
|
| 952 |
+
# {"paypal_order_id": order_id},
|
| 953 |
+
# {"$set": {"status": cap.status}}
|
| 954 |
+
# )
|
| 955 |
+
|
| 956 |
+
# # Insert into subscriptions if plan exists
|
| 957 |
+
# if "plan" in order_doc:
|
| 958 |
+
# now = datetime.utcnow()
|
| 959 |
+
# days = 30 if order_doc["plan"] == "monthly" else 365
|
| 960 |
+
# sub = {
|
| 961 |
+
# "user_id": order_doc["user_id"],
|
| 962 |
+
# "plan": order_doc["plan"],
|
| 963 |
+
# "status": "active",
|
| 964 |
+
# "start_date": now,
|
| 965 |
+
# "end_date": now + timedelta(days=days),
|
| 966 |
+
# "paypal_order_id": order_id,
|
| 967 |
+
# "created_at": now,
|
| 968 |
+
# "updated_at": now
|
| 969 |
+
# }
|
| 970 |
+
# existing = await subscriptions_collection.find_one({"user_id": order_doc["user_id"]})
|
| 971 |
+
# if existing:
|
| 972 |
+
# # Optional: You can update the existing subscription instead of inserting
|
| 973 |
+
# await subscriptions_collection.update_one(
|
| 974 |
+
# {"user_id": order_doc["user_id"]},
|
| 975 |
+
# {"$set": {
|
| 976 |
+
# "plan": order_doc["plan"],
|
| 977 |
+
# "status": "active",
|
| 978 |
+
# "start_date": now,
|
| 979 |
+
# "end_date": now + timedelta(days=days),
|
| 980 |
+
# "paypal_order_id": order_id,
|
| 981 |
+
# "updated_at": now
|
| 982 |
+
# }}
|
| 983 |
+
# )
|
| 984 |
+
# else:
|
| 985 |
+
# await subscriptions_collection.insert_one(sub)
|
| 986 |
+
|
| 987 |
+
|
| 988 |
+
# return {"status": cap.status}
|
| 989 |
+
|
| 990 |
+
|
| 991 |
+
@app.post("/cancel", response_model=dict)
async def cancel_subscription(user_id: str):
    """Cancel the user's active subscription; 404 when no active one exists."""
    active_sub = await subscriptions_collection.find_one({
        "user_id": user_id,
        "status": "active"
    })
    if active_sub is None:
        raise HTTPException(status_code=404, detail="Active subscription not found")

    await subscriptions_collection.update_one(
        {"_id": active_sub["_id"]},
        {"$set": {"status": "cancelled", "updated_at": datetime.utcnow()}}
    )
    return {"status": "cancelled", "message": "Subscription cancelled successfully"}
|
| 1007 |
+
|
| 1008 |
+
|
| 1009 |
+
|
| 1010 |
+
|
| 1011 |
+
# @app.get("/payments")
|
| 1012 |
+
# async def success_page(user_id: str, token: str = "", PayerID: str = ""):
|
| 1013 |
+
# return {"message": "Payment approved", "user_id": user_id, "token": token, "PayerID": PayerID}
|
| 1014 |
+
|
| 1015 |
+
|
| 1016 |
+
@app.post("/webhook")
async def paypal_webhook(request: Request):
    """
    Handle PayPal webhook events; marks the local order COMPLETED when a
    PAYMENT.CAPTURE.COMPLETED event arrives.

    NOTE(security): the payload is not signature-verified, so a forged POST
    could mark orders completed — verify the transmission signature via
    PayPal's verify-webhook-signature API before trusting the event.
    """
    event = await request.json()
    if event.get("event_type") == "PAYMENT.CAPTURE.COMPLETED":
        # Defensive .get() chain: a payload missing "resource" would raise
        # KeyError → 500, causing PayPal to keep retrying the webhook.
        rid = (event.get("resource", {})
               .get("supplementary_data", {})
               .get("related_ids", {})
               .get("order_id"))
        if rid:
            await orders_collection.update_one(
                {"paypal_order_id": rid},
                {"$set": {"status": "COMPLETED"}}
            )
    return {"ok": True}
|
| 1030 |
+
|
| 1031 |
+
|
| 1032 |
+
|
| 1033 |
+
|
| 1034 |
+
|
| 1035 |
+
# To Health Check APP
@app.get("/health", tags=["health"])
async def health_check():
    """Report API, MongoDB, and vector-store health in one payload."""
    try:
        await client.admin.command("ping")
        db_status = "connected"
    except Exception as e:
        db_status = f"error: {str(e)}"

    # Probe the vector store independently so one failure does not mask the other.
    try:
        vector_stats = get_collection_stats()
        vector_db_status = "connected" if vector_stats["exists"] else "empty but ready"
    except Exception as e:
        vector_db_status = f"error: {str(e)}"

    return {
        "status": "healthy",
        "database": db_status,
        "vector_database": vector_db_status,
        "storage_type": "mongodb",
    }
|
| 1057 |
+
|
| 1058 |
+
|
| 1059 |
+
# if __name__ == "__main__":
|
| 1060 |
+
# import uvicorn
|
| 1061 |
+
# uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8001)))
|
App/utils.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import pickle
|
| 4 |
+
from typing import List, Optional, BinaryIO, Dict, Any
|
| 5 |
+
from langchain_community.document_loaders import Docx2txtLoader
|
| 6 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 8 |
+
from langchain_community.vectorstores import Chroma
|
| 9 |
+
from langchain_openai import ChatOpenAI
|
| 10 |
+
from langchain.chains import create_retrieval_chain
|
| 11 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 12 |
+
from langchain.prompts import ChatPromptTemplate
|
| 13 |
+
from langchain.schema import Document
|
| 14 |
+
import tempfile
|
| 15 |
+
from pymongo import MongoClient
|
| 16 |
+
from bson.binary import Binary
|
| 17 |
+
import uuid
|
| 18 |
+
import tiktoken
|
| 19 |
+
from googleapiclient.discovery import build
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
# MongoDB connection
# NOTE(security): the fallback URI below embeds live database credentials in
# source control — rotate them and require MONGODB_URI from the environment
# with no hard-coded default.
MONGODB_URI = os.getenv(
    "MONGODB_URI",
    "mongodb+srv://ahmed0499280:haseeb.2003@cluster0.hzgrxp2.mongodb.net/"
    "?retryWrites=true&w=majority&appName=Cluster0"
)

# MongoDB client (synchronous PyMongo; used by the helpers in this module)
client = MongoClient(MONGODB_URI)
db = client["Cluster0"]
chroma_db_collection = db["chroma_db_store"]  # Collection for storing Chroma DB
# NOTE(security): hard-coded YouTube API key committed to source control —
# rotate it and load from an environment variable instead.
YOUTUBE_API_KEY = "AIzaSyDIaXWJJX2W8swWl093DMyNZ_7TZUGe3DI"
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
|
| 35 |
+
# Create a custom in-memory ChromaDB client
class MongoChromaStore:
    """Custom storage for Chroma that uses MongoDB instead of disk"""

    @staticmethod
    def save_chroma(chroma_db, collection_name="default"):
        """Save a Chroma DB to MongoDB"""
        try:
            # Pull the raw collection payload out of the Chroma instance.
            # This is a simplification — a full implementation would capture
            # more internal state than the .get() snapshot.
            payload = chroma_db._collection.get()

            record = {
                "_id": collection_name,
                "embeddings": Binary(pickle.dumps(payload)),
                "last_updated": pickle.dumps(payload["metadatas"] if "metadatas" in payload else []),
            }

            # Upsert so repeated saves overwrite the previous snapshot.
            chroma_db_collection.replace_one(
                {"_id": collection_name},
                record,
                upsert=True
            )
            return True
        except Exception as e:
            print(f"Error saving Chroma DB to MongoDB: {e}")
            return False
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def count_tokens(text, model_name="gpt-3.5-turbo"):
    """Count tokens for a text string."""
    try:
        enc = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # Fall back to cl100k_base encoding if model not found
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
|
| 77 |
+
|
| 78 |
+
# Video search
def search_youtube_video(query: str) -> str:
    """
    Search YouTube for the top video matching `query` and return its videoId.
    """
    response = youtube.search().list(
        q=query,
        part="id,snippet",
        type="video",
        maxResults=1,
    ).execute()
    hits = response.get("items", [])
    if not hits:
        raise ValueError("No video found for query.")
    return hits[0]["id"]["videoId"]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Load and split DOCX into chunks (from file path)
def load_and_split(filepath: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
    """Load a .docx file and split its text into overlapping chunks."""
    documents = Docx2txtLoader(filepath).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)
|
| 105 |
+
|
| 106 |
+
# Load and split DOCX from bytes (for MongoDB storage)
def load_and_split_bytes(file_bytes: BinaryIO, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
    """Write raw bytes (or a file-like object) to a temp .docx and chunk it."""
    with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
        temp_path = tmp.name
        if hasattr(file_bytes, 'read'):
            # File-like object: rewind first, then copy its full contents.
            file_bytes.seek(0)
            tmp.write(file_bytes.read())
        else:
            # Already raw bytes — write directly.
            tmp.write(file_bytes)

    try:
        # Reuse the path-based loader on the temp file.
        return load_and_split(temp_path, chunk_size, chunk_overlap)
    finally:
        # Always remove the temp file, even if splitting fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)
|
| 128 |
+
|
| 129 |
+
# Build Chroma index and save to MongoDB
def build_chroma_index(docs, embedding_model: str = "Omartificial-Intelligence-Space/GATE-AraBert-v1", collection_name: str = "default"):
    """
    Build a Chroma vector index from `docs`, persist a snapshot of it to
    MongoDB via MongoChromaStore, and return a top-3 similarity retriever.
    """
    import shutil  # hoisted: was imported inside the finally block

    # Create temporary directory for Chroma
    temp_dir = tempfile.mkdtemp()

    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Create or update Chroma DB
        chroma_db = Chroma.from_documents(
            docs,
            embeddings,
            persist_directory=temp_dir,
            collection_name=collection_name
        )

        # Save Chroma DB to MongoDB
        MongoChromaStore.save_chroma(chroma_db, collection_name)

        # Return the retriever
        return chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    finally:
        # Clean up temporary directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
| 156 |
+
|
| 157 |
+
# Get existing Chroma DB from MongoDB
def get_existing_retriever(embedding_model: str = "Omartificial-Intelligence-Space/GATE-AraBert-v1", collection_name: str = "default"):
    """
    Rebuild a retriever from the Chroma snapshot stored in MongoDB.

    Returns a top-3 similarity retriever on success, or None when no snapshot
    exists or reconstruction fails.
    """
    # Check if collection exists in MongoDB
    chroma_data = chroma_db_collection.find_one({"_id": collection_name})
    if not chroma_data:
        return None

    try:
        # Create temporary directory for Chroma
        # NOTE(review): if mkdtemp() itself raises, the finally block below
        # references an unbound temp_dir — consider creating it before try.
        temp_dir = tempfile.mkdtemp()

        # Deserialize embeddings from MongoDB
        embeddings_data = pickle.loads(chroma_data["embeddings"])

        # Use the embeddings to recreate the Chroma DB
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # At this point we would need to reconstruct the Chroma DB
        # This is a simplified implementation that doesn't fully work
        # In a production system, you would need a more complete solution

        # For now, create a new Chroma DB and re-embed the stored documents
        # (re-embedding is wasteful but demonstrates the concept)
        if "documents" in embeddings_data and embeddings_data["documents"]:
            # Create documents from the stored data
            docs = []
            for i, text in enumerate(embeddings_data["documents"]):
                metadata = embeddings_data["metadatas"][i] if "metadatas" in embeddings_data else {}
                docs.append(Document(page_content=text, metadata=metadata))

            # Create a new Chroma DB
            chroma_db = Chroma.from_documents(
                docs,
                embeddings,
                persist_directory=temp_dir,
                collection_name=collection_name
            )

            return chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    except Exception as e:
        print(f"Error loading Chroma DB from MongoDB: {e}")
        return None
    finally:
        # Clean up temporary directory
        import shutil
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
| 205 |
+
|
| 206 |
+
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between vectors `a` and `b` (1.0 = identical direction)."""
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return float(numerator / denominator)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
# Get document count in the collection
def get_collection_stats(collection_name: str = "default"):
    """Return {'exists': bool, 'document_count': int} for a stored Chroma snapshot."""
    snapshot = chroma_db_collection.find_one({"_id": collection_name})
    if not snapshot:
        return {"exists": False, "document_count": 0}

    try:
        # Unpickle the stored payload and count its document entries.
        data = pickle.loads(snapshot["embeddings"])
        doc_count = len(data["documents"]) if "documents" in data else 0
        return {
            "exists": True,
            "document_count": doc_count
        }
    except Exception as e:
        print(f"Error getting collection stats: {e}")
        return {"exists": False, "document_count": 0}
|
| 231 |
+
|
| 232 |
+
# Instantiate LLM (Google Gemini or OpenAI)
def get_llm(temperature: float = 0.0):
    """Return the ChatOpenAI client used by the RAG chains.

    NOTE(review): the `temperature` parameter is accepted but never passed
    to ChatOpenAI — confirm whether this is intentional or a bug.
    NOTE(security): the API key is hard-coded and committed to source
    control — rotate it and load it from an environment variable.
    """
    return ChatOpenAI(model="o4-mini", api_key="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")
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def create_rag_chain_with_history(retriever, llm, lan, level, diacritics=False, history=None):
    """Build a retrieval-augmented QA chain whose system prompt embeds the
    conversation history, the student's level, and the response language.

    The previous revision duplicated the entire ~15-line system prompt in
    three branches that differed only in their final language/diacritics
    sentence; the shared text is now written once.

    Args:
        retriever: LangChain retriever supplying context snippets.
        llm: chat model used to answer.
        lan: response language (e.g. "Arabic", "English").
        level: student level ("beginner" / "intermediate" / "advanced").
        diacritics: for Arabic only -- whether answers must carry full
            diacritics. Any truthy value now enables diacritics (the old
            ``== True`` comparison silently ignored truthy non-bools).
        history: prior turns as ``[{"role": ..., "content": ...}]`` dicts.

    Returns:
        The chain produced by ``create_retrieval_chain``.
    """
    if history is None:
        history = []

    # Instructions shared by every language/diacritics combination
    # (byte-identical to the text previously repeated in all three branches).
    base_prompt = (
        "You are an Assistant for answering questions. "
        "Use the following retrieved context snippets to answer. "
        "Look for the relevance between the context and the question before answering. "
        "If you do not know the answer, say that you do not know. "
        "Be polite, act like a teacher, and provide as detailed an answer as possible based on the context. "
        "Consider the conversation history when responding. "
        "You are designed to help Muslims learn Arabic, so explanations should be culturally respectful and appropriate. "
        "Be responsive to the user's needs—if the user seems stuck or confused during the chat, proactively offer helpful suggestions, clarifications, or encouragement. "
        "Adjust your explanations according to the student's level: "
        "for 'beginner', use very simple language, break down grammar and context step-by-step, and give clear examples; "
        "for 'intermediate', provide more detailed grammar and usage insights with moderate complexity; "
        "for 'advanced', include deeper linguistic explanations, nuanced examples, and encourage self-reflection. "
        "Grammar and contextual explanations should start at the appropriate level and build gradually. "
        "Include examples from the connected knowledge base when possible; otherwise, generate clear and relevant examples yourself. "
    )

    # Only the language/diacritics tail differs between the variants.
    if lan.lower() == 'arabic' and diacritics:
        tail = (
            f"Always respond in {lan} *with all proper diacritics*. "
            f"Student level: {level}."
        )
    elif lan.lower() == 'arabic':
        # Fixed: the original concatenated "...*without diacritics*" directly
        # onto "Student level:" with no separator.
        tail = (
            f"Always respond in {lan} *without diacritics*. "
            f"Student level: {level}."
        )
    else:
        tail = (
            f"Always respond in {lan}. "
            f"Student level: {level}. "
        )

    # "{context}" is a LangChain template placeholder filled by the
    # stuff-documents chain, not an f-string interpolation.
    system_prompt = base_prompt + tail + "{context}"

    # System prompt first, then the prior turns, then the new user input.
    messages = [('system', system_prompt)]
    for message in history:
        messages.append((message["role"], message["content"]))
    messages.append(('human', '{input}'))

    prompt = ChatPromptTemplate.from_messages(messages)
    question_answer_chain = create_stuff_documents_chain(llm, prompt)

    return create_retrieval_chain(retriever, question_answer_chain)
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
# Additional function to add documents to existing index
def add_documents_to_index(docs, embedding_model: str = "Omartificial-Intelligence-Space/GATE-AraBert-v1", collection_name: str = "default"):
    """Append `docs` to an existing Chroma collection stored in MongoDB.

    Strategy: pull the previously pickled documents out of MongoDB, rebuild
    a fresh Chroma index from old + new documents in a temporary directory,
    persist the result back to MongoDB, and return a similarity retriever.
    Falls back to building a brand-new index when nothing exists yet.

    Args:
        docs: new LangChain ``Document`` objects to add.
        embedding_model: HuggingFace model name used for embeddings.
        collection_name: logical collection id (MongoDB ``_id``).

    Returns:
        A Chroma retriever (k=3 similarity search), or whatever
        ``build_chroma_index`` returns when no prior data exists.
    """
    # Get existing retriever
    existing_retriever = get_existing_retriever(embedding_model, collection_name)

    # If no existing retriever, create a new one
    if not existing_retriever:
        return build_chroma_index(docs, embedding_model, collection_name)

    # If we have an existing retriever, we need to add documents to it
    # This is a simplified implementation
    # In a production system, you would need a more complete solution

    # Create temporary directory for Chroma
    temp_dir = tempfile.mkdtemp()

    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

        # Get existing documents
        chroma_data = chroma_db_collection.find_one({"_id": collection_name})
        if chroma_data:
            # NOTE(review): pickle.loads on DB blobs assumes the database
            # contents are fully trusted.
            embeddings_data = pickle.loads(chroma_data["embeddings"])

            # Create documents from the stored data
            existing_docs = []
            if "documents" in embeddings_data and embeddings_data["documents"]:
                for i, text in enumerate(embeddings_data["documents"]):
                    # NOTE(review): assumes "metadatas", when present, is
                    # parallel to "documents" (same length) — confirm against
                    # the writer in MongoChromaStore.save_chroma.
                    metadata = embeddings_data["metadatas"][i] if "metadatas" in embeddings_data else {}
                    existing_docs.append(Document(page_content=text, metadata=metadata))

            # Combine with new documents
            all_docs = existing_docs + docs

            # Create a new Chroma DB with all documents
            # (re-embeds everything from scratch — O(total docs) per call).
            chroma_db = Chroma.from_documents(
                all_docs,
                embeddings,
                persist_directory=temp_dir,
                collection_name=collection_name
            )

            # Save Chroma DB to MongoDB
            MongoChromaStore.save_chroma(chroma_db, collection_name)

            # NOTE(review): temp_dir is removed in the finally block below,
            # so the returned retriever relies on Chroma's in-memory state
            # rather than the (deleted) persist directory — verify this
            # matches how callers use the retriever.
            return chroma_db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
        else:
            # If no existing data, create a new index
            return build_chroma_index(docs, embedding_model, collection_name)

    finally:
        # Clean up temporary directory
        import shutil
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
Dockerfile
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python image
FROM python:3.10
# Set the working directory
WORKDIR /code
# Install dependencies first so this layer is cached across code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the application package. The directory is "App" (capital A) in this
# repo; Linux filesystems are case-sensitive, so the old "COPY ./app ./app"
# could not produce the "App.main" module referenced by the CMD below.
COPY ./App ./App
# Expose port
EXPOSE 7860
# Run the FastAPI app using uvicorn
CMD ["uvicorn", "App.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiohappyeyeballs==2.6.1
|
| 2 |
+
aiohttp==3.11.18
|
| 3 |
+
aiosignal==1.3.2
|
| 4 |
+
annotated-types==0.7.0
|
| 5 |
+
anyio==4.9.0
|
| 6 |
+
asgiref==3.8.1
|
| 7 |
+
attrs==25.3.0
|
| 8 |
+
backoff==2.2.1
|
| 9 |
+
bcrypt==4.3.0
|
| 10 |
+
build==1.2.2.post1
|
| 11 |
+
cachetools==5.5.2
|
| 12 |
+
certifi==2025.4.26
|
| 13 |
+
cffi==1.17.1
|
| 14 |
+
charset-normalizer==3.4.2
|
| 15 |
+
chromadb==1.0.8
|
| 16 |
+
click==8.1.8
|
| 17 |
+
coloredlogs==15.0.1
|
| 18 |
+
cryptography==44.0.3
|
| 19 |
+
dataclasses-json==0.6.7
|
| 20 |
+
defusedxml==0.7.1
|
| 21 |
+
Deprecated==1.2.18
|
| 22 |
+
distro==1.9.0
|
| 23 |
+
dnspython==2.7.0
|
| 24 |
+
docx2txt==0.9
|
| 25 |
+
durationpy==0.9
|
| 26 |
+
ecdsa==0.19.1
|
| 27 |
+
email_validator==2.2.0
|
| 28 |
+
fastapi==0.115.9
|
| 29 |
+
filelock==3.18.0
|
| 30 |
+
filetype==1.2.0
|
| 31 |
+
flatbuffers==25.2.10
|
| 32 |
+
frozenlist==1.6.0
|
| 33 |
+
fsspec==2025.3.2
|
| 34 |
+
google-ai-generativelanguage==0.6.18
|
| 35 |
+
google-api-core==2.24.2
|
| 36 |
+
google-api-python-client==2.169.0
|
| 37 |
+
google-auth==2.40.1
|
| 38 |
+
google-auth-httplib2==0.2.0
|
| 39 |
+
googleapis-common-protos==1.70.0
|
| 40 |
+
greenlet==3.2.1
|
| 41 |
+
grpcio==1.71.0
|
| 42 |
+
grpcio-status==1.71.0
|
| 43 |
+
h11==0.16.0
|
| 44 |
+
httpcore==1.0.9
|
| 45 |
+
httplib2==0.22.0
|
| 46 |
+
httptools==0.6.4
|
| 47 |
+
httpx==0.28.1
|
| 48 |
+
httpx-sse==0.4.0
|
| 49 |
+
huggingface-hub==0.30.2
|
| 50 |
+
humanfriendly==10.0
|
| 51 |
+
idna==3.10
|
| 52 |
+
importlib_metadata==8.6.1
|
| 53 |
+
importlib_resources==6.5.2
|
| 54 |
+
Jinja2==3.1.6
|
| 55 |
+
jiter==0.9.0
|
| 56 |
+
joblib==1.5.0
|
| 57 |
+
jsonpatch==1.33
|
| 58 |
+
jsonpointer==3.0.0
|
| 59 |
+
jsonschema==4.23.0
|
| 60 |
+
jsonschema-specifications==2025.4.1
|
| 61 |
+
jwt==1.3.1
|
| 62 |
+
kubernetes==32.0.1
|
| 63 |
+
langchain==0.3.25
|
| 64 |
+
langchain-community==0.3.23
|
| 65 |
+
langchain-core==0.3.58
|
| 66 |
+
langchain-google-genai==2.1.4
|
| 67 |
+
langchain-huggingface==0.1.2
|
| 68 |
+
langchain-openai==0.3.16
|
| 69 |
+
langchain-text-splitters==0.3.8
|
| 70 |
+
langsmith==0.3.42
|
| 71 |
+
markdown-it-py==3.0.0
|
| 72 |
+
MarkupSafe==3.0.2
|
| 73 |
+
marshmallow==3.26.1
|
| 74 |
+
mdurl==0.1.2
|
| 75 |
+
mmh3==5.1.0
|
| 76 |
+
motor==3.7.0
|
| 77 |
+
mpmath==1.3.0
|
| 78 |
+
multidict==6.4.3
|
| 79 |
+
mypy_extensions==1.1.0
|
| 80 |
+
networkx==3.4.2
|
| 81 |
+
numpy==2.2.5
|
| 82 |
+
nvidia-cublas-cu12==12.6.4.1
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.6.80
|
| 84 |
+
nvidia-cuda-nvrtc-cu12==12.6.77
|
| 85 |
+
nvidia-cuda-runtime-cu12==12.6.77
|
| 86 |
+
nvidia-cudnn-cu12==9.5.1.17
|
| 87 |
+
nvidia-cufft-cu12==11.3.0.4
|
| 88 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 89 |
+
nvidia-curand-cu12==10.3.7.77
|
| 90 |
+
nvidia-cusolver-cu12==11.7.1.2
|
| 91 |
+
nvidia-cusparse-cu12==12.5.4.2
|
| 92 |
+
nvidia-cusparselt-cu12==0.6.3
|
| 93 |
+
nvidia-nccl-cu12==2.26.2
|
| 94 |
+
nvidia-nvjitlink-cu12==12.6.85
|
| 95 |
+
nvidia-nvtx-cu12==12.6.77
|
| 96 |
+
oauthlib==3.2.2
|
| 97 |
+
onnxruntime==1.21.1
|
| 98 |
+
openai==1.77.0
|
| 99 |
+
opentelemetry-api==1.32.1
|
| 100 |
+
opentelemetry-exporter-otlp-proto-common==1.32.1
|
| 101 |
+
opentelemetry-exporter-otlp-proto-grpc==1.32.1
|
| 102 |
+
opentelemetry-instrumentation==0.53b1
|
| 103 |
+
opentelemetry-instrumentation-asgi==0.53b1
|
| 104 |
+
opentelemetry-instrumentation-fastapi==0.53b1
|
| 105 |
+
opentelemetry-proto==1.32.1
|
| 106 |
+
opentelemetry-sdk==1.32.1
|
| 107 |
+
opentelemetry-semantic-conventions==0.53b1
|
| 108 |
+
opentelemetry-util-http==0.53b1
|
| 109 |
+
orjson==3.10.18
|
| 110 |
+
overrides==7.7.0
|
| 111 |
+
packaging==24.2
|
| 112 |
+
passlib==1.7.4
|
| 113 |
+
paypal-checkout-serversdk==1.0.3
|
| 114 |
+
paypalhttp==1.0.1
|
| 115 |
+
pillow==11.2.1
|
| 116 |
+
posthog==4.0.1
|
| 117 |
+
propcache==0.3.1
|
| 118 |
+
proto-plus==1.26.1
|
| 119 |
+
protobuf==5.29.4
|
| 120 |
+
pyasn1==0.6.1
|
| 121 |
+
pyasn1_modules==0.4.2
|
| 122 |
+
pycparser==2.22
|
| 123 |
+
pydantic==2.11.4
|
| 124 |
+
pydantic-settings==2.9.1
|
| 125 |
+
pydantic_core==2.33.2
|
| 126 |
+
Pygments==2.19.1
|
| 127 |
+
pymongo==4.12.1
|
| 128 |
+
pyOpenSSL==25.1.0
|
| 129 |
+
pyparsing==3.2.3
|
| 130 |
+
PyPika==0.48.9
|
| 131 |
+
pyproject_hooks==1.2.0
|
| 132 |
+
python-dateutil==2.9.0.post0
|
| 133 |
+
python-dotenv==1.1.0
|
| 134 |
+
python-jose==3.4.0
|
| 135 |
+
python-multipart==0.0.20
|
| 136 |
+
PyYAML==6.0.2
|
| 137 |
+
referencing==0.36.2
|
| 138 |
+
regex==2024.11.6
|
| 139 |
+
requests==2.32.3
|
| 140 |
+
requests-oauthlib==2.0.0
|
| 141 |
+
requests-toolbelt==1.0.0
|
| 142 |
+
rich==14.0.0
|
| 143 |
+
rpds-py==0.24.0
|
| 144 |
+
rsa==4.9.1
|
| 145 |
+
safetensors==0.5.3
|
| 146 |
+
scikit-learn==1.6.1
|
| 147 |
+
scipy==1.15.2
|
| 148 |
+
sentence-transformers==4.1.0
|
| 149 |
+
setuptools==80.3.1
|
| 150 |
+
shellingham==1.5.4
|
| 151 |
+
six==1.17.0
|
| 152 |
+
sniffio==1.3.1
|
| 153 |
+
SQLAlchemy==2.0.40
|
| 154 |
+
starlette==0.45.3
|
| 155 |
+
sympy==1.14.0
|
| 156 |
+
tenacity==9.1.2
|
| 157 |
+
threadpoolctl==3.6.0
|
| 158 |
+
tiktoken==0.9.0
|
| 159 |
+
tokenizers==0.21.1
|
| 160 |
+
torch==2.7.0
|
| 161 |
+
tqdm==4.67.1
|
| 162 |
+
transformers==4.51.3
|
| 163 |
+
triton==3.3.0
|
| 164 |
+
typer==0.15.3
|
| 165 |
+
typing-inspect==0.9.0
|
| 166 |
+
typing-inspection==0.4.0
|
| 167 |
+
typing_extensions==4.13.2
|
| 168 |
+
uritemplate==4.1.1
|
| 169 |
+
urllib3==2.4.0
|
| 170 |
+
uvicorn==0.34.2
|
| 171 |
+
uvloop==0.21.0
|
| 172 |
+
watchfiles==1.0.5
|
| 173 |
+
websocket-client==1.8.0
|
| 174 |
+
websockets==15.0.1
|
| 175 |
+
wrapt==1.17.2
|
| 176 |
+
yarl==1.20.0
|
| 177 |
+
youtube-transcript-api==1.0.3
|
| 178 |
+
zipp==3.21.0
|
| 179 |
+
zstandard==0.23.0
|