VJnCode committed on
Commit
1c3cab3
·
1 Parent(s): ce60e7f

added klang to routes

Browse files
api/main.py CHANGED
@@ -1,70 +1,83 @@
 
 
1
  from fastapi import FastAPI, HTTPException, status
2
- import asyncio
3
  import logging
4
- from api.routes import endpoints # Changed import path
5
- from api.core.firebase_utils import db, initialize_firebase # Changed import path
6
- from api.services.scheme_service import load_all_schemes_into_cache, is_cache_loading, cached_all_schemes
7
 
8
- from api.routes import rag_route
 
 
9
  from fastapi.middleware.cors import CORSMiddleware
10
 
11
  # Configure logging
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
- app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
 
 
 
 
 
 
 
 
17
  app.add_middleware(
18
  CORSMiddleware,
19
- allow_origins=["*"],
20
  allow_credentials=True,
21
- allow_methods=["*"],
22
- allow_headers=["*"],
23
  )
24
 
25
- # --- Application Startup Event ---
26
- @app.on_event("startup")
27
- async def startup_event():
28
- """
29
- Called when the FastAPI application starts.
30
- Initializes Firebase and initiates the loading of schemes into the cache.
31
- """
32
- initialize_firebase()
33
- # Start cache loading in the background
34
- asyncio.create_task(load_all_schemes_into_cache())
35
- logger.info("Application startup: Initiated cache loading.")
36
-
37
- # --- API Endpoints (include routers) ---
38
- app.include_router(endpoints.router)
39
-
40
  app.include_router(rag_route.router, prefix="/api", tags=["RAG Chatbot"])
41
 
 
42
  @app.get("/")
43
  def root():
44
  """Welcome message for the API."""
45
  return {"message": "Welcome to Chathur API"}
46
 
47
- # Optional: You might still want to expose cache status or trigger refresh directly from main,
48
- # or keep them within the service layer and expose through a router.
49
- @app.get("/cache_status")
50
  def get_cache_status():
51
  """Returns the current status of the scheme cache."""
52
  return {
53
- "cache_loaded": bool(cached_all_schemes),
54
- "cache_loading": is_cache_loading,
55
  "states_in_cache": len(cached_all_schemes)
56
  }
57
 
58
- @app.post("/schemes/refresh_cache")
59
  async def refresh_schemes_cache():
60
  """
61
- Manually triggers a refresh of the in-memory schemes cache from Firestore.
62
- Use this endpoint if your Firestore data changes and you need the API to reflect it immediately.
63
  """
64
- if is_cache_loading:
65
  raise HTTPException(
66
  status_code=status.HTTP_409_CONFLICT,
67
  detail="Cache refresh already in progress."
68
  )
69
- asyncio.create_task(load_all_schemes_into_cache()) # Trigger in background
70
- return {"message": "Schemes cache refresh initiated. It will be updated shortly."}
 
 
 
1
# main.py

import asyncio
import logging
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware

from api.routes import endpoints, rag_route
from api.core.firebase_utils import initialize_firebase
from api.services.scheme_service import load_all_schemes_into_cache, is_cache_loading, cached_all_schemes
11
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
# --- Lifespan Manager for Startup/Shutdown Events ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup and shutdown logic around the application's lifetime.

    Startup: initialize Firebase, then await the full cache load so no
    request is served against an empty cache. Shutdown: log teardown.
    """
    logger.info("Application startup sequence initiated...")
    initialize_firebase()
    # Deliberately awaited (not create_task): the app must not start
    # accepting requests until the scheme cache is warm.
    await load_all_schemes_into_cache()
    logger.info("Application startup complete: Firebase initialized and cache fully loaded.")

    yield  # the application serves requests while suspended here

    logger.info("Application shutting down.")
33
 
34
# Create the FastAPI app instance with the lifespan manager
app = FastAPI(
    title="Chathur API",
    description="API for government schemes and RAG chatbot.",
    lifespan=lifespan,
)

# --- Middleware ---
# NOTE(review): wildcard origins combined with allow_credentials=True is a
# permissive CORS setup; consider pinning allow_origins for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
49
 
50
# --- API Endpoints (Routers) ---
# endpoints.router carries no prefix, so its routes sit at the root;
# the RAG chatbot routes are namespaced under /api.
app.include_router(endpoints.router)
app.include_router(rag_route.router, prefix="/api", tags=["RAG Chatbot"])
54
 
55
# --- Root and Utility Endpoints ---
@app.get("/")
def root():
    """Return the API's welcome message."""
    return {"message": "Welcome to Chathur API"}
60
 
61
+ @app.get("/cache-status", tags=["Cache Management"])
 
 
62
  def get_cache_status():
63
  """Returns the current status of the scheme cache."""
64
  return {
65
+ "is_cache_loading": is_cache_loading(),
66
+ "is_cache_populated": bool(cached_all_schemes),
67
  "states_in_cache": len(cached_all_schemes)
68
  }
69
 
70
+ @app.post("/schemes/refresh-cache", status_code=status.HTTP_202_ACCEPTED, tags=["Cache Management"])
71
  async def refresh_schemes_cache():
72
  """
73
+ Manually triggers a background refresh of the in-memory schemes cache.
 
74
  """
75
+ if is_cache_loading():
76
  raise HTTPException(
77
  status_code=status.HTTP_409_CONFLICT,
78
  detail="Cache refresh already in progress."
79
  )
80
+ # create_task is appropriate here because it's a manual trigger;
81
+ # we want to return a response immediately, not wait for the refresh.
82
+ asyncio.create_task(load_all_schemes_into_cache())
83
+ return {"message": "Schemes cache refresh initiated in the background."}
api/rag/IndicTrans2 ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 53fd3e9df8ca5a5fc9d92f45027959f0b0e0b14f
api/rag/IndicTransToolkit ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 3efb8418d0721b4ce267c2b3586899d313191357
api/rag/rag.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
api/rag/trail.ipynb ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 5,
6
+ "id": "5c1018e2",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "[WinError 3] The system cannot find the path specified: '/content/IndicTrans2/huggingface_interface'\n",
14
+ "d:\\Major Project\\Chathur\\Bakend_HuggingFace\\api\\rag\n"
15
+ ]
16
+ },
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "fatal: destination path 'IndicTrans2' already exists and is not an empty directory.\n"
22
+ ]
23
+ }
24
+ ],
25
+ "source": [
26
+ "!git clone https://github.com/AI4Bharat/IndicTrans2.git\n",
27
+ "%cd /content/IndicTrans2/huggingface_interface"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 6,
33
+ "id": "b4190411",
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "name": "stderr",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "[nltk_data] Downloading package punkt to\n",
41
+ "[nltk_data] C:\\Users\\Hp\\AppData\\Roaming\\nltk_data...\n",
42
+ "[nltk_data] Package punkt is already up-to-date!\n"
43
+ ]
44
+ },
45
+ {
46
+ "name": "stdout",
47
+ "output_type": "stream",
48
+ "text": [
49
+ "Requirement already satisfied: bitsandbytes in d:\\major project\\chathur\\.venv\\lib\\site-packages (0.47.0)\n",
50
+ "Requirement already satisfied: scipy in d:\\major project\\chathur\\.venv\\lib\\site-packages (1.16.1)\n",
51
+ "Requirement already satisfied: accelerate in d:\\major project\\chathur\\.venv\\lib\\site-packages (1.10.1)\n",
52
+ "Requirement already satisfied: datasets in d:\\major project\\chathur\\.venv\\lib\\site-packages (4.0.0)\n",
53
+ "Requirement already satisfied: torch<3,>=2.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from bitsandbytes) (2.8.0)\n",
54
+ "Requirement already satisfied: numpy>=1.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from bitsandbytes) (2.3.2)\n",
55
+ "Requirement already satisfied: filelock in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (3.19.1)\n",
56
+ "Requirement already satisfied: typing-extensions>=4.10.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (4.15.0)\n",
57
+ "Requirement already satisfied: sympy>=1.13.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (1.14.0)\n",
58
+ "Requirement already satisfied: networkx in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (3.5)\n",
59
+ "Requirement already satisfied: jinja2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (3.1.6)\n",
60
+ "Requirement already satisfied: fsspec in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (2025.3.0)\n",
61
+ "Requirement already satisfied: setuptools in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (80.9.0)\n",
62
+ "Requirement already satisfied: packaging>=20.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (24.2)\n",
63
+ "Requirement already satisfied: psutil in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (7.0.0)\n",
64
+ "Requirement already satisfied: pyyaml in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (6.0.2)\n",
65
+ "Requirement already satisfied: huggingface_hub>=0.21.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (0.34.4)\n",
66
+ "Requirement already satisfied: safetensors>=0.4.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (0.6.2)\n",
67
+ "Requirement already satisfied: pyarrow>=15.0.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (21.0.0)\n",
68
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (0.3.8)\n",
69
+ "Requirement already satisfied: pandas in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (2.3.2)\n",
70
+ "Requirement already satisfied: requests>=2.32.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (2.32.5)\n",
71
+ "Requirement already satisfied: tqdm>=4.66.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (4.67.1)\n",
72
+ "Requirement already satisfied: xxhash in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (3.5.0)\n",
73
+ "Requirement already satisfied: multiprocess<0.70.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (0.70.16)\n",
74
+ "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.12.15)\n",
75
+ "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n",
76
+ "Requirement already satisfied: aiosignal>=1.4.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)\n",
77
+ "Requirement already satisfied: attrs>=17.3.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.3.0)\n",
78
+ "Requirement already satisfied: frozenlist>=1.1.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.7.0)\n",
79
+ "Requirement already satisfied: multidict<7.0,>=4.5 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.6.4)\n",
80
+ "Requirement already satisfied: propcache>=0.2.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.3.2)\n",
81
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.20.1)\n",
82
+ "Requirement already satisfied: idna>=2.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from yarl<2.0,>=1.17.0->aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.10)\n",
83
+ "Requirement already satisfied: charset_normalizer<4,>=2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.32.2->datasets) (3.4.3)\n",
84
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.32.2->datasets) (2.5.0)\n",
85
+ "Requirement already satisfied: certifi>=2017.4.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.32.2->datasets) (2025.8.3)\n",
86
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sympy>=1.13.3->torch<3,>=2.2->bitsandbytes) (1.3.0)\n",
87
+ "Requirement already satisfied: colorama in d:\\major project\\chathur\\.venv\\lib\\site-packages (from tqdm>=4.66.3->datasets) (0.4.6)\n",
88
+ "Requirement already satisfied: MarkupSafe>=2.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from jinja2->torch<3,>=2.2->bitsandbytes) (3.0.2)\n",
89
+ "Requirement already satisfied: python-dateutil>=2.8.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->datasets) (2.9.0.post0)\n",
90
+ "Requirement already satisfied: pytz>=2020.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->datasets) (2025.2)\n",
91
+ "Requirement already satisfied: tzdata>=2022.7 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->datasets) (2025.2)\n",
92
+ "Requirement already satisfied: six>=1.5 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n",
93
+ "Requirement already satisfied: sentencepiece in d:\\major project\\chathur\\.venv\\lib\\site-packages (0.2.1)\n",
94
+ "d:\\Major Project\\Chathur\\Bakend_HuggingFace\\api\\rag\\IndicTransToolkit\n"
95
+ ]
96
+ },
97
+ {
98
+ "name": "stderr",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "fatal: destination path 'IndicTransToolkit' already exists and is not an empty directory.\n"
102
+ ]
103
+ },
104
+ {
105
+ "name": "stdout",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "Obtaining file:///D:/Major%20Project/Chathur/Bakend_HuggingFace/api/rag/IndicTransToolkit\n",
109
+ " Installing build dependencies: started\n",
110
+ " Installing build dependencies: finished with status 'done'\n",
111
+ " Checking if build backend supports build_editable: started\n",
112
+ " Checking if build backend supports build_editable: finished with status 'done'\n",
113
+ " Getting requirements to build editable: started\n",
114
+ " Getting requirements to build editable: finished with status 'done'\n",
115
+ " Preparing editable metadata (pyproject.toml): started\n",
116
+ " Preparing editable metadata (pyproject.toml): finished with status 'done'\n",
117
+ "Requirement already satisfied: cython in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (3.1.3)\n",
118
+ "Requirement already satisfied: sacremoses in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (0.1.1)\n",
119
+ "Requirement already satisfied: transformers in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (4.55.4)\n",
120
+ "Requirement already satisfied: sacrebleu in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (2.5.1)\n",
121
+ "Requirement already satisfied: indic-nlp-library-itt in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (0.1.1)\n",
122
+ "Requirement already satisfied: morfessor in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.6)\n",
123
+ "Requirement already satisfied: numpy in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.3.2)\n",
124
+ "Requirement already satisfied: pandas in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.3.2)\n",
125
+ "Requirement already satisfied: sphinx-argparse in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (0.5.2)\n",
126
+ "Requirement already satisfied: sphinx-rtd-theme in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.0.2)\n",
127
+ "Requirement already satisfied: python-dateutil>=2.8.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.9.0.post0)\n",
128
+ "Requirement already satisfied: pytz>=2020.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2025.2)\n",
129
+ "Requirement already satisfied: tzdata>=2022.7 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2025.2)\n",
130
+ "Requirement already satisfied: six>=1.5 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->indic-nlp-library-itt->indictranstoolkit==1.1.1) (1.17.0)\n",
131
+ "Requirement already satisfied: portalocker in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (3.2.0)\n",
132
+ "Requirement already satisfied: regex in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (2025.7.34)\n",
133
+ "Requirement already satisfied: tabulate>=0.8.9 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (0.9.0)\n",
134
+ "Requirement already satisfied: colorama in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (0.4.6)\n",
135
+ "Requirement already satisfied: lxml in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (6.0.1)\n",
136
+ "Requirement already satisfied: pywin32>=226 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from portalocker->sacrebleu->indictranstoolkit==1.1.1) (311)\n",
137
+ "Requirement already satisfied: click in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacremoses->indictranstoolkit==1.1.1) (8.2.1)\n",
138
+ "Requirement already satisfied: joblib in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacremoses->indictranstoolkit==1.1.1) (1.5.2)\n",
139
+ "Requirement already satisfied: tqdm in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacremoses->indictranstoolkit==1.1.1) (4.67.1)\n",
140
+ "Requirement already satisfied: sphinx>=5.1.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (8.2.3)\n",
141
+ "Requirement already satisfied: docutils>=0.19 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (0.21.2)\n",
142
+ "Requirement already satisfied: sphinxcontrib-applehelp>=1.0.7 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.0)\n",
143
+ "Requirement already satisfied: sphinxcontrib-devhelp>=1.0.6 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.0)\n",
144
+ "Requirement already satisfied: sphinxcontrib-htmlhelp>=2.0.6 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.1.0)\n",
145
+ "Requirement already satisfied: sphinxcontrib-jsmath>=1.0.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (1.0.1)\n",
146
+ "Requirement already satisfied: sphinxcontrib-qthelp>=1.0.6 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.0)\n",
147
+ "Requirement already satisfied: sphinxcontrib-serializinghtml>=1.1.9 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.0)\n",
148
+ "Requirement already satisfied: Jinja2>=3.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.1.6)\n",
149
+ "Requirement already satisfied: Pygments>=2.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.19.2)\n",
150
+ "Requirement already satisfied: snowballstemmer>=2.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.0.1)\n",
151
+ "Requirement already satisfied: babel>=2.13 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.17.0)\n",
152
+ "Requirement already satisfied: alabaster>=0.7.14 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (1.0.0)\n",
153
+ "Requirement already satisfied: imagesize>=1.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (1.4.1)\n",
154
+ "Requirement already satisfied: requests>=2.30.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.32.5)\n",
155
+ "Requirement already satisfied: roman-numerals-py>=1.0.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.1.0)\n",
156
+ "Requirement already satisfied: packaging>=23.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (24.2)\n",
157
+ "Requirement already satisfied: MarkupSafe>=2.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from Jinja2>=3.1->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.0.2)\n",
158
+ "Requirement already satisfied: charset_normalizer<4,>=2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.30.0->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.4.3)\n",
159
+ "Requirement already satisfied: idna<4,>=2.5 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.30.0->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.10)\n",
160
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.30.0->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.5.0)\n",
161
+ "Requirement already satisfied: certifi>=2017.4.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.30.0->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2025.8.3)\n",
162
+ "Requirement already satisfied: sphinxcontrib-jquery<5,>=4 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx-rtd-theme->indic-nlp-library-itt->indictranstoolkit==1.1.1) (4.1)\n",
163
+ "Requirement already satisfied: filelock in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (3.19.1)\n",
164
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (0.34.4)\n",
165
+ "Requirement already satisfied: pyyaml>=5.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (6.0.2)\n",
166
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (0.21.4)\n",
167
+ "Requirement already satisfied: safetensors>=0.4.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (0.6.2)\n",
168
+ "Requirement already satisfied: fsspec>=2023.5.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.34.0->transformers->indictranstoolkit==1.1.1) (2025.3.0)\n",
169
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.34.0->transformers->indictranstoolkit==1.1.1) (4.15.0)\n",
170
+ "Building wheels for collected packages: indictranstoolkit\n",
171
+ " Building editable for indictranstoolkit (pyproject.toml): started\n",
172
+ " Building editable for indictranstoolkit (pyproject.toml): finished with status 'error'\n",
173
+ "Failed to build indictranstoolkit\n",
174
+ "d:\\Major Project\\Chathur\\Bakend_HuggingFace\\api\\rag\n"
175
+ ]
176
+ },
177
+ {
178
+ "name": "stderr",
179
+ "output_type": "stream",
180
+ "text": [
181
+ " error: subprocess-exited-with-error\n",
182
+ " \n",
183
+ " × Building editable for indictranstoolkit (pyproject.toml) did not run successfully.\n",
184
+ " │ exit code: 1\n",
185
+ " ╰─> [69 lines of output]\n",
186
+ " C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-build-env-fz7kfyri\\overlay\\Lib\\site-packages\\setuptools\\config\\_apply_pyprojecttoml.py:82: SetuptoolsDeprecationWarning: `project.license` as a TOML table is deprecated\n",
187
+ " !!\n",
188
+ " \n",
189
+ " ********************************************************************************\n",
190
+ " Please use a simple string containing a SPDX expression for `project.license`. You can also use `project.license-files`. (Both options available on setuptools>=77.0.0).\n",
191
+ " \n",
192
+ " By 2026-Feb-18, you need to update your project and remove deprecated calls\n",
193
+ " or your builds will no longer be supported.\n",
194
+ " \n",
195
+ " See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.\n",
196
+ " ********************************************************************************\n",
197
+ " \n",
198
+ " !!\n",
199
+ " corresp(dist, value, root_dir)\n",
200
+ " C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-build-env-fz7kfyri\\overlay\\Lib\\site-packages\\setuptools\\config\\_apply_pyprojecttoml.py:61: SetuptoolsDeprecationWarning: License classifiers are deprecated.\n",
201
+ " !!\n",
202
+ " \n",
203
+ " ********************************************************************************\n",
204
+ " Please consider removing the following classifiers in favor of a SPDX license expression:\n",
205
+ " \n",
206
+ " License :: OSI Approved :: MIT License\n",
207
+ " \n",
208
+ " See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.\n",
209
+ " ********************************************************************************\n",
210
+ " \n",
211
+ " !!\n",
212
+ " dist._finalize_license_expression()\n",
213
+ " C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-build-env-fz7kfyri\\overlay\\Lib\\site-packages\\setuptools\\dist.py:759: SetuptoolsDeprecationWarning: License classifiers are deprecated.\n",
214
+ " !!\n",
215
+ " \n",
216
+ " ********************************************************************************\n",
217
+ " Please consider removing the following classifiers in favor of a SPDX license expression:\n",
218
+ " \n",
219
+ " License :: OSI Approved :: MIT License\n",
220
+ " \n",
221
+ " See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.\n",
222
+ " ********************************************************************************\n",
223
+ " \n",
224
+ " !!\n",
225
+ " self._finalize_license_expression()\n",
226
+ " running editable_wheel\n",
227
+ " creating C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\n",
228
+ " writing C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\PKG-INFO\n",
229
+ " writing dependency_links to C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\dependency_links.txt\n",
230
+ " writing requirements to C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\requires.txt\n",
231
+ " writing top-level names to C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\top_level.txt\n",
232
+ " writing manifest file 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\SOURCES.txt'\n",
233
+ " reading manifest file 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\SOURCES.txt'\n",
234
+ " reading manifest template 'MANIFEST.in'\n",
235
+ " warning: no files found matching '*.so' under directory 'IndicTransToolkit'\n",
236
+ " adding license file 'LICENSE'\n",
237
+ " writing manifest file 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\SOURCES.txt'\n",
238
+ " creating 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit-1.1.1.dist-info'\n",
239
+ " creating C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit-1.1.1.dist-info\\WHEEL\n",
240
+ " running build_py\n",
241
+ " running build_ext\n",
242
+ " building 'IndicTransToolkit.processor' extension\n",
243
+ " creating C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\n",
244
+ " \"C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\BuildTools\\VC\\Tools\\MSVC\\14.29.30133\\bin\\HostX86\\x64\\cl.exe\" /c /nologo /O2 /W3 /GL /DNDEBUG /MD \"-Id:\\Major Project\\Chathur\\.venv\\include\" -ID:\\SOFTWARE\\Python\\include -ID:\\SOFTWARE\\Python\\Include \"-IC:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\BuildTools\\VC\\Tools\\MSVC\\14.29.30133\\include\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\ucrt\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\shared\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\um\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\winrt\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\cppwinrt\" /TcIndicTransToolkit/processor.c /FoC:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.obj\n",
245
+ " processor.c\n",
246
+ " IndicTransToolkit/processor.c(7951): warning C4244: '=': conversion from 'Py_ssize_t' to 'int', possible loss of data\n",
247
+ " IndicTransToolkit/processor.c(8597): warning C4244: '=': conversion from 'Py_ssize_t' to 'int', possible loss of data\n",
248
+ " creating C:\\Users\\Hp\\AppData\\Local\\Temp\\tmp645n8bz_.build-lib\\IndicTransToolkit\n",
249
+ " \"C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\BuildTools\\VC\\Tools\\MSVC\\14.29.30133\\bin\\HostX86\\x64\\link.exe\" /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO \"/LIBPATH:d:\\Major Project\\Chathur\\.venv\\libs\" /LIBPATH:D:\\SOFTWARE\\Python\\libs /LIBPATH:D:\\SOFTWARE\\Python \"/LIBPATH:d:\\Major Project\\Chathur\\.venv\\PCbuild\\amd64\" \"/LIBPATH:C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\BuildTools\\VC\\Tools\\MSVC\\14.29.30133\\lib\\x64\" \"/LIBPATH:C:\\Program Files (x86)\\Windows Kits\\10\\lib\\10.0.19041.0\\ucrt\\x64\" \"/LIBPATH:C:\\Program Files (x86)\\Windows Kits\\10\\lib\\10.0.19041.0\\um\\x64\" /EXPORT:PyInit_processor C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.obj /OUT:C:\\Users\\Hp\\AppData\\Local\\Temp\\tmp645n8bz_.build-lib\\IndicTransToolkit\\processor.cp313-win_amd64.pyd /IMPLIB:C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.cp313-win_amd64.lib\n",
250
+ " Creating library C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.cp313-win_amd64.lib and object C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.cp313-win_amd64.exp\n",
251
+ " Generating code\n",
252
+ " Finished generating code\n",
253
+ " copying C:\\Users\\Hp\\AppData\\Local\\Temp\\tmp645n8bz_.build-lib\\IndicTransToolkit\\processor.cp313-win_amd64.pyd -> IndicTransToolkit\n",
254
+ " error: could not delete 'IndicTransToolkit\\processor.cp313-win_amd64.pyd': Access is denied\n",
255
+ " [end of output]\n",
256
+ " \n",
257
+ " note: This error originates from a subprocess, and is likely not a problem with pip.\n",
258
+ " ERROR: Failed building editable for indictranstoolkit\n",
259
+ "error: failed-wheel-build-for-install\n",
260
+ "\n",
261
+ "× Failed to build installable wheels for some pyproject.toml based projects\n",
262
+ "╰─> indictranstoolkit\n"
263
+ ]
264
+ }
265
+ ],
266
+ "source": [
267
+ "!python -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer\n",
268
+ "!python -c \"import nltk; nltk.download('punkt')\"\n",
269
+ "!python -m pip install bitsandbytes scipy accelerate datasets\n",
270
+ "!python -m pip install sentencepiece\n",
271
+ "\n",
272
+ "!git clone https://github.com/VarunGumma/IndicTransToolkit.git\n",
273
+ "%cd IndicTransToolkit\n",
274
+ "!python -m pip install --editable ./\n",
275
+ "%cd .."
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 1,
281
+ "id": "81d64601",
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "import torch\n",
286
+ "from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer\n",
287
+ "from IndicTransToolkit.processor import IndicProcessor\n",
288
+ "\n",
289
+ "BATCH_SIZE = 4\n",
290
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
291
+ "quantization = None"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 2,
297
+ "id": "d260bc8d",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "def initialize_model_and_tokenizer(ckpt_dir, quantization):\n",
302
+ " if quantization == \"4-bit\":\n",
303
+ " qconfig = BitsAndBytesConfig(\n",
304
+ " load_in_4bit=True,\n",
305
+ " bnb_4bit_use_double_quant=True,\n",
306
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
307
+ " )\n",
308
+ " elif quantization == \"8-bit\":\n",
309
+ " qconfig = BitsAndBytesConfig(\n",
310
+ " load_in_8bit=True,\n",
311
+ " bnb_8bit_use_double_quant=True,\n",
312
+ " bnb_8bit_compute_dtype=torch.bfloat16,\n",
313
+ " )\n",
314
+ " else:\n",
315
+ " qconfig = None\n",
316
+ "\n",
317
+ " tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)\n",
318
+ " model = AutoModelForSeq2SeqLM.from_pretrained(\n",
319
+ " ckpt_dir,\n",
320
+ " trust_remote_code=True,\n",
321
+ " low_cpu_mem_usage=True,\n",
322
+ " quantization_config=qconfig,\n",
323
+ " )\n",
324
+ "\n",
325
+ " if qconfig == None:\n",
326
+ " model = model.to(DEVICE)\n",
327
+ " if DEVICE == \"cuda\":\n",
328
+ " model.half()\n",
329
+ "\n",
330
+ " model.eval()\n",
331
+ "\n",
332
+ " return tokenizer, model\n",
333
+ "\n",
334
+ "\n",
335
+ "def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):\n",
336
+ " translations = []\n",
337
+ " for i in range(0, len(input_sentences), BATCH_SIZE):\n",
338
+ " batch = input_sentences[i : i + BATCH_SIZE]\n",
339
+ "\n",
340
+ " # Preprocess the batch and extract entity mappings\n",
341
+ " batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)\n",
342
+ "\n",
343
+ " # Tokenize the batch and generate input encodings\n",
344
+ " inputs = tokenizer(\n",
345
+ " batch,\n",
346
+ " truncation=True,\n",
347
+ " padding=\"longest\",\n",
348
+ " return_tensors=\"pt\",\n",
349
+ " return_attention_mask=True,\n",
350
+ " ).to(DEVICE)\n",
351
+ "\n",
352
+ " # Generate translations using the model\n",
353
+ " with torch.no_grad():\n",
354
+ " generated_tokens = model.generate(\n",
355
+ " **inputs,\n",
356
+ " use_cache=True,\n",
357
+ " min_length=0,\n",
358
+ " max_length=256,\n",
359
+ " num_beams=5,\n",
360
+ " num_return_sequences=1,\n",
361
+ " )\n",
362
+ "\n",
363
+ " # Decode the generated tokens into text\n",
364
+ " generated_tokens = tokenizer.batch_decode(\n",
365
+ " generated_tokens,\n",
366
+ " skip_special_tokens=True,\n",
367
+ " clean_up_tokenization_spaces=True,\n",
368
+ " )\n",
369
+ "\n",
370
+ " # Postprocess the translations, including entity replacement\n",
371
+ " translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)\n",
372
+ "\n",
373
+ " del inputs\n",
374
+ " torch.cuda.empty_cache()\n",
375
+ "\n",
376
+ " return translations"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": 3,
382
+ "id": "634056be",
383
+ "metadata": {},
384
+ "outputs": [
385
+ {
386
+ "ename": "AssertionError",
387
+ "evalue": "Invalid source language tag: <hin_Deva>",
388
+ "output_type": "error",
389
+ "traceback": [
390
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
391
+ "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)",
392
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 33\u001b[39m\n\u001b[32m 28\u001b[39m \u001b[38;5;66;03m# Example\u001b[39;00m\n\u001b[32m 29\u001b[39m en_sents = [\n\u001b[32m 30\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mWhen I was young, I used to go to the park every day.\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 31\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mHe has many old books, which he inherited from his ancestors.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 32\u001b[39m ]\n\u001b[32m---> \u001b[39m\u001b[32m33\u001b[39m translations = \u001b[43mbatch_translate\u001b[49m\u001b[43m(\u001b[49m\u001b[43men_sents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43meng_Latn\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mhin_Deva\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 35\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m src, tgt \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(en_sents, translations):\n\u001b[32m 36\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msrc\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m --> \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtgt\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
393
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 19\u001b[39m, in \u001b[36mbatch_translate\u001b[39m\u001b[34m(sentences, src_lang, tgt_lang)\u001b[39m\n\u001b[32m 16\u001b[39m tagged_sentences = [\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m<\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtgt_lang\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m> \u001b[39m\u001b[38;5;132;01m{\u001b[39;00ms\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m sentences]\n\u001b[32m 18\u001b[39m \u001b[38;5;66;03m# Tokenize\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m19\u001b[39m inputs = \u001b[43mtokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtagged_sentences\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpt\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m.to(DEVICE)\n\u001b[32m 21\u001b[39m \u001b[38;5;66;03m# Generate translations\u001b[39;00m\n\u001b[32m 22\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n",
394
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:2910\u001b[39m, in \u001b[36mPreTrainedTokenizerBase.__call__\u001b[39m\u001b[34m(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[39m\n\u001b[32m 2908\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._in_target_context_manager:\n\u001b[32m 2909\u001b[39m \u001b[38;5;28mself\u001b[39m._switch_to_input_mode()\n\u001b[32m-> \u001b[39m\u001b[32m2910\u001b[39m encodings = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mall_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2911\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m text_target \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 2912\u001b[39m \u001b[38;5;28mself\u001b[39m._switch_to_target_mode()\n",
395
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:2998\u001b[39m, in \u001b[36mPreTrainedTokenizerBase._call_one\u001b[39m\u001b[34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[39m\n\u001b[32m 2993\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 2994\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mbatch length of `text`: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(text)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m does not match batch length of `text_pair`:\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2995\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(text_pair)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2996\u001b[39m )\n\u001b[32m 2997\u001b[39m batch_text_or_text_pairs = \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mzip\u001b[39m(text, text_pair)) \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m text\n\u001b[32m-> \u001b[39m\u001b[32m2998\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbatch_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2999\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3000\u001b[39m \u001b[43m 
\u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3001\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3002\u001b[39m \u001b[43m \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtruncation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3003\u001b[39m \u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3004\u001b[39m \u001b[43m \u001b[49m\u001b[43mstride\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3005\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3006\u001b[39m \u001b[43m \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3007\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3008\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3009\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3010\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3011\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3012\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3013\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3014\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3015\u001b[39m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m=\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3016\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3017\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3018\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3019\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 3020\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.encode_plus(\n\u001b[32m 3021\u001b[39m text=text,\n\u001b[32m 3022\u001b[39m text_pair=text_pair,\n\u001b[32m (...)\u001b[39m\u001b[32m 3040\u001b[39m **kwargs,\n\u001b[32m 3041\u001b[39m )\n",
396
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:3199\u001b[39m, in \u001b[36mPreTrainedTokenizerBase.batch_encode_plus\u001b[39m\u001b[34m(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[39m\n\u001b[32m 3189\u001b[39m \u001b[38;5;66;03m# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\u001b[39;00m\n\u001b[32m 3190\u001b[39m padding_strategy, truncation_strategy, max_length, kwargs = \u001b[38;5;28mself\u001b[39m._get_padding_truncation_strategies(\n\u001b[32m 3191\u001b[39m padding=padding,\n\u001b[32m 3192\u001b[39m truncation=truncation,\n\u001b[32m (...)\u001b[39m\u001b[32m 3196\u001b[39m **kwargs,\n\u001b[32m 3197\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m3199\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_batch_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3200\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3201\u001b[39m \u001b[43m \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3202\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3203\u001b[39m \u001b[43m \u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3204\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3205\u001b[39m \u001b[43m \u001b[49m\u001b[43mstride\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3206\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3207\u001b[39m \u001b[43m \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3208\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3209\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3210\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3211\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3212\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3213\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3214\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3215\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3216\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m=\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3217\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3218\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3219\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
397
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils.py:887\u001b[39m, in \u001b[36mPreTrainedTokenizer._batch_encode_plus\u001b[39m\u001b[34m(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[39m\n\u001b[32m 884\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 885\u001b[39m ids, pair_ids = ids_or_pair_ids\n\u001b[32m--> \u001b[39m\u001b[32m887\u001b[39m first_ids = \u001b[43mget_input_ids\u001b[49m\u001b[43m(\u001b[49m\u001b[43mids\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 888\u001b[39m second_ids = get_input_ids(pair_ids) \u001b[38;5;28;01mif\u001b[39;00m pair_ids \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 889\u001b[39m input_ids.append((first_ids, second_ids))\n",
398
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils.py:854\u001b[39m, in \u001b[36mPreTrainedTokenizer._batch_encode_plus.<locals>.get_input_ids\u001b[39m\u001b[34m(text)\u001b[39m\n\u001b[32m 852\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_input_ids\u001b[39m(text):\n\u001b[32m 853\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text, \u001b[38;5;28mstr\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m854\u001b[39m tokens = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 855\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.convert_tokens_to_ids(tokens)\n\u001b[32m 856\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(text) > \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text[\u001b[32m0\u001b[39m], \u001b[38;5;28mstr\u001b[39m):\n",
399
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils.py:697\u001b[39m, in \u001b[36mPreTrainedTokenizer.tokenize\u001b[39m\u001b[34m(self, text, **kwargs)\u001b[39m\n\u001b[32m 695\u001b[39m tokenized_text.append(token)\n\u001b[32m 696\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m697\u001b[39m tokenized_text.extend(\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_tokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[32m 698\u001b[39m \u001b[38;5;66;03m# [\"This\", \" is\", \" something\", \"<special_token_1>\", \"else\"]\u001b[39;00m\n\u001b[32m 699\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m tokenized_text\n",
400
+ "\u001b[36mFile \u001b[39m\u001b[32m~\\.cache\\huggingface\\modules\\transformers_modules\\ai4bharat\\indictrans2-en-indic-1B\\10e65a9951a1e922cd109a95e8aba9357b62144b\\tokenization_indictrans.py:201\u001b[39m, in \u001b[36mIndicTransTokenizer._src_tokenize\u001b[39m\u001b[34m(self, text)\u001b[39m\n\u001b[32m 199\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_src_tokenize\u001b[39m(\u001b[38;5;28mself\u001b[39m, text: \u001b[38;5;28mstr\u001b[39m) -> List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[32m 200\u001b[39m src_lang, tgt_lang, text = text.split(\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[33m\"\u001b[39m, \u001b[32m2\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m201\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m src_lang \u001b[38;5;129;01min\u001b[39;00m LANGUAGE_TAGS, \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInvalid source language tag: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msrc_lang\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 202\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m tgt_lang \u001b[38;5;129;01min\u001b[39;00m LANGUAGE_TAGS, \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInvalid target language tag: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtgt_lang\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 203\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m [src_lang, tgt_lang] + \u001b[38;5;28mself\u001b[39m.spm.EncodeAsPieces(text)\n",
401
+ "\u001b[31mAssertionError\u001b[39m: Invalid source language tag: <hin_Deva>"
402
+ ]
403
+ }
404
+ ],
405
+ "source": [
406
+ "import torch\n",
407
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
408
+ "\n",
409
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
410
+ "\n",
411
+ "model_name = \"ai4bharat/indictrans2-en-indic-1B\"\n",
412
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
413
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\n",
414
+ " model_name,\n",
415
+ " trust_remote_code=True,\n",
416
+ " torch_dtype=torch.float32 # safer on CPU/Windows\n",
417
+ ").to(DEVICE)\n",
418
+ "\n",
419
+ "def batch_translate(sentences, src_lang, tgt_lang):\n",
420
+ " # Add target language tag to each sentence\n",
421
+ " tagged_sentences = [f\"<{tgt_lang}> {s}\" for s in sentences]\n",
422
+ "\n",
423
+ " # Tokenize\n",
424
+ " inputs = tokenizer(tagged_sentences, return_tensors=\"pt\", padding=True, truncation=True).to(DEVICE)\n",
425
+ "\n",
426
+ " # Generate translations\n",
427
+ " with torch.no_grad():\n",
428
+ " outputs = model.generate(**inputs, max_length=256, num_beams=5)\n",
429
+ "\n",
430
+ " # Decode\n",
431
+ " return tokenizer.batch_decode(outputs, skip_special_tokens=True)\n",
432
+ "\n",
433
+ "# Example\n",
434
+ "en_sents = [\n",
435
+ " \"When I was young, I used to go to the park every day.\",\n",
436
+ " \"He has many old books, which he inherited from his ancestors.\"\n",
437
+ "]\n",
438
+ "translations = batch_translate(en_sents, \"eng_Latn\", \"hin_Deva\")\n",
439
+ "\n",
440
+ "for src, tgt in zip(en_sents, translations):\n",
441
+ " print(f\"{src} --> {tgt}\")\n"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": 3,
447
+ "id": "6226efc6",
448
+ "metadata": {},
449
+ "outputs": [
450
+ {
451
+ "data": {
452
+ "application/vnd.jupyter.widget-view+json": {
453
+ "model_id": "723d9adbe3d04fa0a614614fd9e12402",
454
+ "version_major": 2,
455
+ "version_minor": 0
456
+ },
457
+ "text/plain": [
458
+ "tokenizer_config.json: 0%| | 0.00/44.0 [00:00<?, ?B/s]"
459
+ ]
460
+ },
461
+ "metadata": {},
462
+ "output_type": "display_data"
463
+ },
464
+ {
465
+ "data": {
466
+ "application/vnd.jupyter.widget-view+json": {
467
+ "model_id": "899ca69572054095939b5d5bb30ef0fa",
468
+ "version_major": 2,
469
+ "version_minor": 0
470
+ },
471
+ "text/plain": [
472
+ "source.spm: 0%| | 0.00/812k [00:00<?, ?B/s]"
473
+ ]
474
+ },
475
+ "metadata": {},
476
+ "output_type": "display_data"
477
+ },
478
+ {
479
+ "data": {
480
+ "application/vnd.jupyter.widget-view+json": {
481
+ "model_id": "828492450e724b67bad0bc91d5627377",
482
+ "version_major": 2,
483
+ "version_minor": 0
484
+ },
485
+ "text/plain": [
486
+ "target.spm: 0%| | 0.00/1.07M [00:00<?, ?B/s]"
487
+ ]
488
+ },
489
+ "metadata": {},
490
+ "output_type": "display_data"
491
+ },
492
+ {
493
+ "data": {
494
+ "application/vnd.jupyter.widget-view+json": {
495
+ "model_id": "0c8a6aa4773a427d98c7daf6807c84cb",
496
+ "version_major": 2,
497
+ "version_minor": 0
498
+ },
499
+ "text/plain": [
500
+ "vocab.json: 0.00B [00:00, ?B/s]"
501
+ ]
502
+ },
503
+ "metadata": {},
504
+ "output_type": "display_data"
505
+ },
506
+ {
507
+ "data": {
508
+ "application/vnd.jupyter.widget-view+json": {
509
+ "model_id": "e9d7c0c795034dce94b2ab1846ed91bc",
510
+ "version_major": 2,
511
+ "version_minor": 0
512
+ },
513
+ "text/plain": [
514
+ "config.json: 0.00B [00:00, ?B/s]"
515
+ ]
516
+ },
517
+ "metadata": {},
518
+ "output_type": "display_data"
519
+ },
520
+ {
521
+ "data": {
522
+ "application/vnd.jupyter.widget-view+json": {
523
+ "model_id": "29eb15e6882b41afad2f0a6bb1dbfea3",
524
+ "version_major": 2,
525
+ "version_minor": 0
526
+ },
527
+ "text/plain": [
528
+ "pytorch_model.bin: 0%| | 0.00/306M [00:00<?, ?B/s]"
529
+ ]
530
+ },
531
+ "metadata": {},
532
+ "output_type": "display_data"
533
+ },
534
+ {
535
+ "ename": "OSError",
536
+ "evalue": "[Errno 28] No space left on device",
537
+ "output_type": "error",
538
+ "traceback": [
539
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
540
+ "\u001b[31mOSError\u001b[39m Traceback (most recent call last)",
541
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 4\u001b[39m model_name = \u001b[33m\"\u001b[39m\u001b[33mHelsinki-NLP/opus-mt-en-hi\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 5\u001b[39m tokenizer = MarianTokenizer.from_pretrained(model_name)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m model = \u001b[43mMarianMTModel\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m)\u001b[49m.to(\u001b[33m\"\u001b[39m\u001b[33mcuda\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch.cuda.is_available() \u001b[38;5;28;01melse\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mcpu\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mtranslate\u001b[39m(texts):\n\u001b[32m 9\u001b[39m inputs = tokenizer(texts, return_tensors=\u001b[33m\"\u001b[39m\u001b[33mpt\u001b[39m\u001b[33m\"\u001b[39m, padding=\u001b[38;5;28;01mTrue\u001b[39;00m, truncation=\u001b[38;5;28;01mTrue\u001b[39;00m).to(model.device)\n",
542
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\modeling_utils.py:317\u001b[39m, in \u001b[36mrestore_default_torch_dtype.<locals>._wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 315\u001b[39m old_dtype = torch.get_default_dtype()\n\u001b[32m 316\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m317\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 318\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 319\u001b[39m torch.set_default_dtype(old_dtype)\n",
543
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\modeling_utils.py:4923\u001b[39m, in \u001b[36mPreTrainedModel.from_pretrained\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)\u001b[39m\n\u001b[32m 4913\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[32m 4914\u001b[39m gguf_file\n\u001b[32m 4915\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m device_map \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 4916\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m ((\u001b[38;5;28misinstance\u001b[39m(device_map, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mdisk\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m device_map.values()) \u001b[38;5;129;01mor\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mdisk\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m device_map)\n\u001b[32m 4917\u001b[39m ):\n\u001b[32m 4918\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[32m 4919\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mOne or more modules is configured to be mapped to disk. 
Disk offload is not supported for models \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 4920\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mloaded from GGUF files.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 4921\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m4923\u001b[39m checkpoint_files, sharded_metadata = \u001b[43m_get_resolved_checkpoint_files\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 4924\u001b[39m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4925\u001b[39m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4926\u001b[39m \u001b[43m \u001b[49m\u001b[43mvariant\u001b[49m\u001b[43m=\u001b[49m\u001b[43mvariant\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4927\u001b[39m \u001b[43m \u001b[49m\u001b[43mgguf_file\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgguf_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4928\u001b[39m \u001b[43m \u001b[49m\u001b[43mfrom_tf\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfrom_tf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4929\u001b[39m \u001b[43m \u001b[49m\u001b[43mfrom_flax\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfrom_flax\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4930\u001b[39m \u001b[43m \u001b[49m\u001b[43muse_safetensors\u001b[49m\u001b[43m=\u001b[49m\u001b[43muse_safetensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4931\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4932\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4933\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4934\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4935\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4936\u001b[39m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m=\u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4937\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4938\u001b[39m \u001b[43m \u001b[49m\u001b[43mcommit_hash\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4939\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_remote_code\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_auto_class\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 4940\u001b[39m \u001b[43m \u001b[49m\u001b[43mtransformers_explicit_filename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtransformers_explicit_filename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4941\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4943\u001b[39m is_sharded = sharded_metadata \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 4944\u001b[39m is_quantized = hf_quantizer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
544
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\modeling_utils.py:1208\u001b[39m, in \u001b[36m_get_resolved_checkpoint_files\u001b[39m\u001b[34m(pretrained_model_name_or_path, subfolder, variant, gguf_file, from_tf, from_flax, use_safetensors, cache_dir, force_download, proxies, local_files_only, token, user_agent, revision, commit_hash, is_remote_code, transformers_explicit_filename)\u001b[39m\n\u001b[32m 1205\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1206\u001b[39m \u001b[38;5;66;03m# This repo has no safetensors file of any kind, we switch to PyTorch.\u001b[39;00m\n\u001b[32m 1207\u001b[39m filename = _add_variant(WEIGHTS_NAME, variant)\n\u001b[32m-> \u001b[39m\u001b[32m1208\u001b[39m resolved_archive_file = \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1209\u001b[39m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mcached_file_kwargs\u001b[49m\n\u001b[32m 1210\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1211\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m resolved_archive_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m filename == _add_variant(WEIGHTS_NAME, variant):\n\u001b[32m 1212\u001b[39m \u001b[38;5;66;03m# Maybe the checkpoint is sharded, we try to grab the index name in this case.\u001b[39;00m\n\u001b[32m 1213\u001b[39m resolved_archive_file = cached_file(\n\u001b[32m 1214\u001b[39m pretrained_model_name_or_path,\n\u001b[32m 1215\u001b[39m _add_variant(WEIGHTS_INDEX_NAME, variant),\n\u001b[32m 1216\u001b[39m **cached_file_kwargs,\n\u001b[32m 1217\u001b[39m )\n",
545
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\utils\\hub.py:321\u001b[39m, in \u001b[36mcached_file\u001b[39m\u001b[34m(path_or_repo_id, filename, **kwargs)\u001b[39m\n\u001b[32m 263\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcached_file\u001b[39m(\n\u001b[32m 264\u001b[39m path_or_repo_id: Union[\u001b[38;5;28mstr\u001b[39m, os.PathLike],\n\u001b[32m 265\u001b[39m filename: \u001b[38;5;28mstr\u001b[39m,\n\u001b[32m 266\u001b[39m **kwargs,\n\u001b[32m 267\u001b[39m ) -> Optional[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[32m 268\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 269\u001b[39m \u001b[33;03m Tries to locate a file in a local folder and repo, downloads and cache it if necessary.\u001b[39;00m\n\u001b[32m 270\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 319\u001b[39m \u001b[33;03m ```\u001b[39;00m\n\u001b[32m 320\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m321\u001b[39m file = \u001b[43mcached_files\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilenames\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 322\u001b[39m file = file[\u001b[32m0\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m file\n\u001b[32m 323\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m file\n",
546
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\utils\\hub.py:567\u001b[39m, in \u001b[36mcached_files\u001b[39m\u001b[34m(path_or_repo_id, filenames, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[39m\n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# Any other Exception type should now be re-raised, in order to provide helpful error messages and break the execution flow\u001b[39;00m\n\u001b[32m 565\u001b[39m \u001b[38;5;66;03m# (EntryNotFoundError will be treated outside this block and correctly re-raised if needed)\u001b[39;00m\n\u001b[32m 566\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e, EntryNotFoundError):\n\u001b[32m--> \u001b[39m\u001b[32m567\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 569\u001b[39m resolved_files = [\n\u001b[32m 570\u001b[39m _get_cache_file_to_return(path_or_repo_id, filename, cache_dir, revision) \u001b[38;5;28;01mfor\u001b[39;00m filename \u001b[38;5;129;01min\u001b[39;00m full_filenames\n\u001b[32m 571\u001b[39m ]\n\u001b[32m 572\u001b[39m \u001b[38;5;66;03m# If there are any missing file and the flag is active, raise\u001b[39;00m\n",
547
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\utils\\hub.py:479\u001b[39m, in \u001b[36mcached_files\u001b[39m\u001b[34m(path_or_repo_id, filenames, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[39m\n\u001b[32m 476\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 477\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(full_filenames) == \u001b[32m1\u001b[39m:\n\u001b[32m 478\u001b[39m \u001b[38;5;66;03m# This is slightly better for only 1 file\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m479\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 480\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 481\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilenames\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 482\u001b[39m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m==\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 483\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 484\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 485\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 486\u001b[39m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m=\u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 487\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 488\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 489\u001b[39m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 490\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 491\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 492\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 493\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 494\u001b[39m snapshot_download(\n\u001b[32m 495\u001b[39m path_or_repo_id,\n\u001b[32m 496\u001b[39m allow_patterns=full_filenames,\n\u001b[32m (...)\u001b[39m\u001b[32m 505\u001b[39m local_files_only=local_files_only,\n\u001b[32m 506\u001b[39m )\n",
548
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:114\u001b[39m, in \u001b[36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 111\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[32m 112\u001b[39m kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.\u001b[34m__name__\u001b[39m, has_token=has_token, kwargs=kwargs)\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
549
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:1010\u001b[39m, in \u001b[36mhf_hub_download\u001b[39m\u001b[34m(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, user_agent, force_download, proxies, etag_timeout, token, local_files_only, headers, endpoint, resume_download, force_filename, local_dir_use_symlinks)\u001b[39m\n\u001b[32m 990\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _hf_hub_download_to_local_dir(\n\u001b[32m 991\u001b[39m \u001b[38;5;66;03m# Destination\u001b[39;00m\n\u001b[32m 992\u001b[39m local_dir=local_dir,\n\u001b[32m (...)\u001b[39m\u001b[32m 1007\u001b[39m local_files_only=local_files_only,\n\u001b[32m 1008\u001b[39m )\n\u001b[32m 1009\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1010\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_hf_hub_download_to_cache_dir\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1011\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Destination\u001b[39;49;00m\n\u001b[32m 1012\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1013\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# File info\u001b[39;49;00m\n\u001b[32m 1014\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1015\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1016\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1017\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1018\u001b[39m \u001b[43m 
\u001b[49m\u001b[38;5;66;43;03m# HTTP info\u001b[39;49;00m\n\u001b[32m 1019\u001b[39m \u001b[43m \u001b[49m\u001b[43mendpoint\u001b[49m\u001b[43m=\u001b[49m\u001b[43mendpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1020\u001b[39m \u001b[43m \u001b[49m\u001b[43metag_timeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43metag_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1021\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhf_headers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1022\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1023\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1024\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Additional options\u001b[39;49;00m\n\u001b[32m 1025\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1026\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1027\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
550
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:1171\u001b[39m, in \u001b[36m_hf_hub_download_to_cache_dir\u001b[39m\u001b[34m(cache_dir, repo_id, filename, repo_type, revision, endpoint, etag_timeout, headers, proxies, token, local_files_only, force_download)\u001b[39m\n\u001b[32m 1168\u001b[39m \u001b[38;5;66;03m# Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)\u001b[39;00m\n\u001b[32m 1170\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m WeakFileLock(lock_path):\n\u001b[32m-> \u001b[39m\u001b[32m1171\u001b[39m \u001b[43m_download_to_tmp_and_move\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1172\u001b[39m \u001b[43m \u001b[49m\u001b[43mincomplete_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblob_path\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m.incomplete\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1173\u001b[39m \u001b[43m \u001b[49m\u001b[43mdestination_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblob_path\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1174\u001b[39m \u001b[43m \u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1175\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1176\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1177\u001b[39m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1178\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1179\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1180\u001b[39m \u001b[43m \u001b[49m\u001b[43metag\u001b[49m\u001b[43m=\u001b[49m\u001b[43metag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1181\u001b[39m \u001b[43m \u001b[49m\u001b[43mxet_file_data\u001b[49m\u001b[43m=\u001b[49m\u001b[43mxet_file_data\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1182\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1183\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m os.path.exists(pointer_path):\n\u001b[32m 1184\u001b[39m _create_symlink(blob_path, pointer_path, new_blob=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
551
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:1738\u001b[39m, in \u001b[36m_download_to_tmp_and_move\u001b[39m\u001b[34m(incomplete_path, destination_path, url_to_download, proxies, headers, expected_size, filename, force_download, etag, xet_file_data)\u001b[39m\n\u001b[32m 1731\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m xet_file_data \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m constants.HF_HUB_DISABLE_XET:\n\u001b[32m 1732\u001b[39m logger.warning(\n\u001b[32m 1733\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mXet Storage is enabled for this repo, but the \u001b[39m\u001b[33m'\u001b[39m\u001b[33mhf_xet\u001b[39m\u001b[33m'\u001b[39m\u001b[33m package is not installed. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1734\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mFalling back to regular HTTP download. 
\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1735\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mFor better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1736\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1738\u001b[39m \u001b[43mhttp_get\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1739\u001b[39m \u001b[43m \u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1740\u001b[39m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1741\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1742\u001b[39m \u001b[43m \u001b[49m\u001b[43mresume_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresume_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1743\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1744\u001b[39m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1745\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1747\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDownload complete. Moving file to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdestination_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 1748\u001b[39m _chmod_and_move(incomplete_path, destination_path)\n",
552
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:499\u001b[39m, in \u001b[36mhttp_get\u001b[39m\u001b[34m(url, temp_file, proxies, resume_size, headers, expected_size, displayed_filename, _nb_retries, _tqdm_bar)\u001b[39m\n\u001b[32m 497\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m chunk: \u001b[38;5;66;03m# filter out keep-alive new chunks\u001b[39;00m\n\u001b[32m 498\u001b[39m progress.update(\u001b[38;5;28mlen\u001b[39m(chunk))\n\u001b[32m--> \u001b[39m\u001b[32m499\u001b[39m \u001b[43mtemp_file\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 500\u001b[39m new_resume_size += \u001b[38;5;28mlen\u001b[39m(chunk)\n\u001b[32m 501\u001b[39m \u001b[38;5;66;03m# Some data has been downloaded from the server so we reset the number of retries.\u001b[39;00m\n",
553
+ "\u001b[31mOSError\u001b[39m: [Errno 28] No space left on device"
554
+ ]
555
+ }
556
+ ],
557
+ "source": [
558
+ "from transformers import MarianMTModel, MarianTokenizer\n",
559
+ "import torch\n",
560
+ "\n",
561
+ "model_name = \"Helsinki-NLP/opus-mt-en-hi\"\n",
562
+ "tokenizer = MarianTokenizer.from_pretrained(model_name)\n",
563
+ "model = MarianMTModel.from_pretrained(model_name).to(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
564
+ "\n",
565
+ "def translate(texts):\n",
566
+ " inputs = tokenizer(texts, return_tensors=\"pt\", padding=True, truncation=True).to(model.device)\n",
567
+ " translated = model.generate(**inputs, max_length=256)\n",
568
+ " return tokenizer.batch_decode(translated, skip_special_tokens=True)\n",
569
+ "\n",
570
+ "sentences = [\n",
571
+ " \"I love Indian food.\",\n",
572
+ " \"My friend is visiting Delhi tomorrow.\",\n",
573
+ " \"The weather is very pleasant today.\"\n",
574
+ "]\n",
575
+ "print(translate(sentences))\n"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 1,
581
+ "id": "0166a4f1",
582
+ "metadata": {},
583
+ "outputs": [
584
+ {
585
+ "name": "stdout",
586
+ "output_type": "stream",
587
+ "text": [
588
+ "Files removed: 0 (0 bytes)\n",
589
+ "Note: you may need to restart the kernel to use updated packages.\n"
590
+ ]
591
+ },
592
+ {
593
+ "name": "stderr",
594
+ "output_type": "stream",
595
+ "text": [
596
+ "WARNING: No matching packages\n"
597
+ ]
598
+ }
599
+ ],
600
+ "source": [
601
+ "%pip cache purge\n"
602
+ ]
603
+ },
604
+ {
605
+ "cell_type": "code",
606
+ "execution_count": null,
607
+ "id": "d9218bcd",
608
+ "metadata": {},
609
+ "outputs": [],
610
+ "source": []
611
+ }
612
+ ],
613
+ "metadata": {
614
+ "kernelspec": {
615
+ "display_name": ".venv",
616
+ "language": "python",
617
+ "name": "python3"
618
+ },
619
+ "language_info": {
620
+ "codemirror_mode": {
621
+ "name": "ipython",
622
+ "version": 3
623
+ },
624
+ "file_extension": ".py",
625
+ "mimetype": "text/x-python",
626
+ "name": "python",
627
+ "nbconvert_exporter": "python",
628
+ "pygments_lexer": "ipython3",
629
+ "version": "3.13.7"
630
+ }
631
+ },
632
+ "nbformat": 4,
633
+ "nbformat_minor": 5
634
+ }
api/rag/translator.ipynb ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "1243db1a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "!pip install --upgrade torch --index-url https://download.pytorch.org/whl/cu121 --quiet\n",
11
+ "!pip install --upgrade transformers datasets sentencepiece sacrebleu evaluate --quiet\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "markdown",
16
+ "id": "03d54e16",
17
+ "metadata": {},
18
+ "source": [
19
+ "### Install dependencies and upgrade PyTorch"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "id": "b1593c50",
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "import os\n",
30
+ "os.environ[\"WANDB_DISABLED\"] = \"true\" # Disable WandB logging completely\n",
31
+ "\n",
32
+ "import torch\n",
33
+ "from datasets import load_dataset, concatenate_datasets\n",
34
+ "import evaluate\n",
35
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "markdown",
40
+ "id": "ec6c67e5",
41
+ "metadata": {},
42
+ "source": [
43
+ "### 1. Set device for GPU if available"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "id": "3aeb9062",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
54
+ "print(\"Using device:\", device)"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "id": "8fe4ab2b",
60
+ "metadata": {},
61
+ "source": [
62
+ "### 2. Load English–Kannada and English–Hindi datasets"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "id": "4a2d41a9",
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "print(\"Loading datasets...\")\n",
73
+ "dataset_kn = load_dataset(\"ai4bharat/samanantar\", \"kn\", split=\"train\")\n",
74
+ "dataset_hi = load_dataset(\"ai4bharat/samanantar\", \"hi\", split=\"train\")\n",
75
+ "\n",
76
+ "print(\"English–Kannada sample:\", dataset_kn[0])\n",
77
+ "print(\"English–Hindi sample:\", dataset_hi[0])"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "id": "dac2a390",
83
+ "metadata": {},
84
+ "source": [
85
+ "### 3. Reduce dataset size for faster local training"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "id": "43ab2de9",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "max_samples = 50000 # Adjust this number based on your system performance\n",
96
+ "if len(dataset_kn) > max_samples:\n",
97
+ " dataset_kn = dataset_kn.select(range(max_samples))\n",
98
+ "if len(dataset_hi) > max_samples:\n",
99
+ " dataset_hi = dataset_hi.select(range(max_samples))"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "markdown",
104
+ "id": "4c1e651d",
105
+ "metadata": {},
106
+ "source": [
107
+ "### 4. Merge datasets"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "2ac937ec",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "dataset = concatenate_datasets([dataset_kn, dataset_hi])\n",
118
+ "print(\"Combined dataset size:\", len(dataset))"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "id": "8c3252bf",
124
+ "metadata": {},
125
+ "source": [
126
+ "### 5. Load tokenizer and model (mT5-small with safetensors)"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "id": "ebd71460",
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "\n",
137
+ "model_checkpoint = \"google/mt5-small\"\n",
138
+ "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
139
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, trust_remote_code=True, use_safetensors=True)\n",
140
+ "model.to(device) # Move model to GPU if available\n",
141
+ "\n",
142
+ "# ================================================\n",
143
+ "# 6. Preprocess data\n",
144
+ "# ================================================\n",
145
+ "max_len = 128\n",
146
+ "\n",
147
+ "def preprocess_function(examples):\n",
148
+ " inputs = examples[\"src\"] # English text\n",
149
+ " targets = examples[\"tgt\"] # Kannada or Hindi text\n",
150
+ " model_inputs = tokenizer(inputs, truncation=True, padding=\"max_length\", max_length=max_len)\n",
151
+ " with tokenizer.as_target_tokenizer():\n",
152
+ " labels = tokenizer(targets, truncation=True, padding=\"max_length\", max_length=max_len)\n",
153
+ " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
154
+ " return model_inputs\n",
155
+ "\n",
156
+ "print(\"Tokenizing dataset...\")\n",
157
+ "tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"idx\", \"src\", \"tgt\"])\n",
158
+ "\n",
159
+ "# ================================================\n",
160
+ "# 7. Training setup\n",
161
+ "# ================================================\n",
162
+ "metric = evaluate.load(\"sacrebleu\")\n",
163
+ "\n",
164
+ "def compute_metrics(eval_pred):\n",
165
+ " preds, labels = eval_pred\n",
166
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
167
+ " labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]\n",
168
+ " result = metric.compute(predictions=decoded_preds, references=labels)\n",
169
+ " result[\"bleu\"] = result[\"score\"]\n",
170
+ " return result\n",
171
+ "\n",
172
+ "training_args = Seq2SeqTrainingArguments(\n",
173
+ " output_dir=\"./translator-model\",\n",
174
+ " do_eval=True,\n",
175
+ " per_device_train_batch_size=8,\n",
176
+ " per_device_eval_batch_size=8,\n",
177
+ " learning_rate=5e-5,\n",
178
+ " num_train_epochs=2,\n",
179
+ " weight_decay=0.01,\n",
180
+ " save_total_limit=2,\n",
181
+ " predict_with_generate=True,\n",
182
+ " logging_steps=200,\n",
183
+ " save_steps=1000,\n",
184
+ " report_to=\"none\" # no W&B logging\n",
185
+ ")\n",
186
+ "\n",
187
+ "trainer = Seq2SeqTrainer(\n",
188
+ " model=model,\n",
189
+ " args=training_args,\n",
190
+ " train_dataset=tokenized_dataset,\n",
191
+ " eval_dataset=tokenized_dataset.select(range(min(1000, len(tokenized_dataset)))), # small eval set\n",
192
+ " tokenizer=tokenizer,\n",
193
+ " compute_metrics=compute_metrics,\n",
194
+ ")"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "markdown",
199
+ "id": "e7d85b60",
200
+ "metadata": {},
201
+ "source": [
202
+ "### 8. Train model"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": null,
208
+ "id": "d031c154",
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "print(\"Starting training...\")\n",
213
+ "trainer.train()"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "markdown",
218
+ "id": "4679d55b",
219
+ "metadata": {},
220
+ "source": [
221
+ "### 9. Evaluate model"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": null,
227
+ "id": "9d484587",
228
+ "metadata": {},
229
+ "outputs": [],
230
+ "source": [
231
+ "print(\"Running evaluation...\")\n",
232
+ "results = trainer.evaluate()\n",
233
+ "print(\"Evaluation BLEU score:\", results.get(\"bleu\", results))\n"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "markdown",
238
+ "id": "c39f5ed2",
239
+ "metadata": {},
240
+ "source": [
241
+ "### 10. Save final model locally"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": null,
247
+ "id": "8e5f4517",
248
+ "metadata": {},
249
+ "outputs": [],
250
+ "source": [
251
+ "trainer.save_model(\"./final-translator\")\n",
252
+ "tokenizer.save_pretrained(\"./final-translator\")\n",
253
+ "print(\"Model saved in ./final-translator\")"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "markdown",
258
+ "id": "7d019483",
259
+ "metadata": {},
260
+ "source": [
261
+ "### 11. Test translation"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "markdown",
266
+ "id": "67c28c80",
267
+ "metadata": {},
268
+ "source": [
269
+ "test_sentence = \"How are you?\"\n",
270
+ "inputs = tokenizer(test_sentence, return_tensors=\"pt\", padding=True).to(device)\n",
271
+ "outputs = model.generate(**inputs, max_length=50)\n",
272
+ "print(\"Input:\", test_sentence)\n",
273
+ "print(\"Translated:\", tokenizer.decode(outputs[0], skip_special_tokens=True))"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 1,
279
+ "id": "d5daee8c",
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "ename": "ModuleNotFoundError",
284
+ "evalue": "No module named 'torch'",
285
+ "output_type": "error",
286
+ "traceback": [
287
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
288
+ "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
289
+ "Cell \u001b[1;32mIn[1], line 11\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# ==========================================================\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# 0. Install dependencies (run in terminal once)\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# ==========================================================\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# 1. Imports and setup\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;66;03m# ==========================================================\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m load_dataset, concatenate_datasets\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mevaluate\u001b[39;00m\n",
290
+ "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torch'"
291
+ ]
292
+ }
293
+ ],
294
+ "source": [
295
+ "# ==========================================================\n",
296
+ "# 0. Install dependencies (run in terminal once)\n",
297
+ "# ==========================================================\n",
298
+ "# pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1+cu121 --index-url https://download.pytorch.org/whl/cu121\n",
299
+ "# pip install --upgrade transformers datasets sentencepiece sacrebleu evaluate peft --quiet\n",
300
+ "\n",
301
+ "# ==========================================================\n",
302
+ "# 1. Imports and setup\n",
303
+ "# ==========================================================\n",
304
+ "import os\n",
305
+ "import torch\n",
306
+ "from datasets import load_dataset, concatenate_datasets\n",
307
+ "import evaluate\n",
308
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
309
+ "\n",
310
+ "# PEFT for LoRA\n",
311
+ "from peft import LoraConfig, get_peft_model\n",
312
+ "\n",
313
+ "# Disable wandb\n",
314
+ "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
315
+ "\n",
316
+ "# Use GPU if available\n",
317
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
318
+ "print(\"Using device:\", device)\n",
319
+ "\n",
320
+ "# ==========================================================\n",
321
+ "# 2. Load datasets\n",
322
+ "# ==========================================================\n",
323
+ "print(\"Loading datasets...\")\n",
324
+ "dataset_kn = load_dataset(\"ai4bharat/samanantar\", \"kn\", split=\"train\")\n",
325
+ "dataset_hi = load_dataset(\"ai4bharat/samanantar\", \"hi\", split=\"train\")\n",
326
+ "\n",
327
+ "# Optional: reduce dataset size for quick local training\n",
328
+ "max_samples = 5000 # adjust depending on your GPU memory\n",
329
+ "dataset_kn = dataset_kn.select(range(min(len(dataset_kn), max_samples)))\n",
330
+ "dataset_hi = dataset_hi.select(range(min(len(dataset_hi), max_samples)))\n",
331
+ "\n",
332
+ "# Merge datasets\n",
333
+ "dataset = concatenate_datasets([dataset_kn, dataset_hi])\n",
334
+ "print(\"Combined dataset size:\", len(dataset))\n",
335
+ "\n",
336
+ "# ==========================================================\n",
337
+ "# 3. Load tokenizer and model (mt5-small for low VRAM)\n",
339
+ "# ==========================================================\n",
340
+ "model_checkpoint = \"google/mt5-small\"\n",
340
+ "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
341
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, use_safetensors=True)\n",
342
+ "model.to(device)\n",
343
+ "\n",
344
+ "# ==========================================================\n",
345
+ "# 4. LoRA config (parameter-efficient fine-tuning)\n",
346
+ "# ==========================================================\n",
347
+ "lora_config = LoraConfig(\n",
348
+ " r=8,\n",
349
+ " lora_alpha=16,\n",
350
+ " target_modules=[\"q\", \"v\"], # applies LoRA to Q and V matrices\n",
351
+ " lora_dropout=0.05,\n",
352
+ " bias=\"none\",\n",
353
+ " task_type=\"SEQ_2_SEQ_LM\"\n",
354
+ ")\n",
355
+ "model = get_peft_model(model, lora_config)\n",
356
+ "print(\"LoRA applied for low-memory fine-tuning.\")\n",
357
+ "\n",
358
+ "# ==========================================================\n",
359
+ "# 5. Preprocessing\n",
360
+ "# ==========================================================\n",
361
+ "max_len = 128\n",
362
+ "\n",
363
+ "def preprocess_function(examples):\n",
364
+ " inputs = examples[\"src\"] # English\n",
365
+ " targets = examples[\"tgt\"] # Kannada or Hindi\n",
366
+ " model_inputs = tokenizer(inputs, truncation=True, padding=\"max_length\", max_length=max_len)\n",
367
+ " with tokenizer.as_target_tokenizer():\n",
368
+ " labels = tokenizer(targets, truncation=True, padding=\"max_length\", max_length=max_len)\n",
369
+ " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
370
+ " return model_inputs\n",
371
+ "\n",
372
+ "print(\"Tokenizing dataset...\")\n",
373
+ "tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"idx\", \"src\", \"tgt\"])\n",
374
+ "\n",
375
+ "# ==========================================================\n",
376
+ "# 6. Evaluation metric\n",
377
+ "# ==========================================================\n",
378
+ "metric = evaluate.load(\"sacrebleu\")\n",
379
+ "\n",
380
+ "def compute_metrics(eval_pred):\n",
381
+ " preds, labels = eval_pred\n",
382
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
383
+ " labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]\n",
384
+ " result = metric.compute(predictions=decoded_preds, references=labels)\n",
385
+ " result[\"bleu\"] = result[\"score\"]\n",
386
+ " return result\n",
387
+ "\n",
388
+ "# ==========================================================\n",
389
+ "# 7. Training setup\n",
390
+ "# ==========================================================\n",
391
+ "training_args = Seq2SeqTrainingArguments(\n",
392
+ " output_dir=\"./translator-model\",\n",
393
+ " do_train=True,\n",
394
+ " do_eval=True,\n",
395
+ " per_device_train_batch_size=4, # reduce if out-of-memory\n",
396
+ " per_device_eval_batch_size=4,\n",
397
+ " learning_rate=5e-5,\n",
398
+ " num_train_epochs=2,\n",
399
+ " weight_decay=0.01,\n",
400
+ " save_total_limit=2,\n",
401
+ " predict_with_generate=True,\n",
402
+ " logging_steps=50,\n",
403
+ " save_steps=500,\n",
404
+ " report_to=\"none\"\n",
405
+ ")\n",
406
+ "\n",
407
+ "trainer = Seq2SeqTrainer(\n",
408
+ " model=model,\n",
409
+ " args=training_args,\n",
410
+ " train_dataset=tokenized_dataset,\n",
411
+ " eval_dataset=tokenized_dataset.select(range(min(500, len(tokenized_dataset)))), # small eval\n",
412
+ " tokenizer=tokenizer,\n",
413
+ " compute_metrics=compute_metrics,\n",
414
+ ")\n",
415
+ "\n",
416
+ "\n",
417
+ "\n",
418
+ "# ==========================================================\n",
419
+ "# 11. Test translation\n",
420
+ "# ==========================================================\n",
421
+ "test_sentence = \"How are you?\"\n",
422
+ "inputs = tokenizer(test_sentence, return_tensors=\"pt\", padding=True).to(device)\n",
423
+ "outputs = model.generate(**inputs, max_length=50)\n",
424
+ "print(\"Input:\", test_sentence)\n",
425
+ "print(\"Translated:\", tokenizer.decode(outputs[0], skip_special_tokens=True))\n"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": null,
431
+ "id": "50378dce",
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "# ==========================================================\n",
436
+ "# 8. Train the model\n",
437
+ "# ==========================================================\n",
438
+ "print(\"Starting training...\")\n",
439
+ "trainer.train()\n",
440
+ "\n",
441
+ "# ==========================================================\n",
442
+ "# 9. Evaluate\n",
443
+ "# ==========================================================\n",
444
+ "print(\"Running evaluation...\")\n",
445
+ "results = trainer.evaluate()\n",
446
+ "print(\"Evaluation BLEU score:\", results.get(\"bleu\", results))\n",
447
+ "\n"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": null,
453
+ "id": "2b11c06e",
454
+ "metadata": {},
455
+ "outputs": [],
456
+ "source": [
457
+ "# ==========================================================\n",
458
+ "# 10. Save final model\n",
459
+ "# ==========================================================\n",
460
+ "trainer.save_model(\"./final-translator\")\n",
461
+ "tokenizer.save_pretrained(\"./final-translator\")\n",
462
+ "print(\"Model saved in ./final-translator\")"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": 2,
468
+ "id": "5112bac7",
469
+ "metadata": {},
470
+ "outputs": [
471
+ {
472
+ "name": "stdout",
473
+ "output_type": "stream",
474
+ "text": [
475
+ "Looking in indexes: https://download.pytorch.org/whl/cu121\n",
476
+ "Collecting torch==2.5.1+cu121\n",
477
+ " Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp39-cp39-win_amd64.whl (2449.3 MB)\n",
478
+ "Note: you may need to restart the kernel to use updated packages.\n"
479
+ ]
480
+ },
481
+ {
482
+ "name": "stderr",
483
+ "output_type": "stream",
484
+ "text": [
485
+ "ERROR: Could not find a version that satisfies the requirement torchvision==0.12.1+cu121 (from versions: 0.1.6, 0.2.0, 0.16.0+cu121, 0.16.1+cu121, 0.16.2+cu121, 0.17.0+cu121, 0.17.1+cu121, 0.17.2+cu121, 0.18.0+cu121, 0.18.1+cu121, 0.19.0+cu121, 0.19.1+cu121, 0.20.0+cu121, 0.20.1+cu121)\n",
486
+ "ERROR: No matching distribution found for torchvision==0.12.1+cu121\n"
487
+ ]
488
+ }
489
+ ],
490
+ "source": [
491
+ "%pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1+cu121 --index-url https://download.pytorch.org/whl/cu121\n"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "execution_count": null,
497
+ "id": "37e4d148",
498
+ "metadata": {},
499
+ "outputs": [],
500
+ "source": []
501
+ }
502
+ ],
503
+ "metadata": {
504
+ "kernelspec": {
505
+ "display_name": "chathur",
506
+ "language": "python",
507
+ "name": "python3"
508
+ },
509
+ "language_info": {
510
+ "codemirror_mode": {
511
+ "name": "ipython",
512
+ "version": 3
513
+ },
514
+ "file_extension": ".py",
515
+ "mimetype": "text/x-python",
516
+ "name": "python",
517
+ "nbconvert_exporter": "python",
518
+ "pygments_lexer": "ipython3",
519
+ "version": "3.9.13"
520
+ }
521
+ },
522
+ "nbformat": 4,
523
+ "nbformat_minor": 5
524
+ }
api/routes/endpoints.py CHANGED
@@ -1,12 +1,21 @@
1
- from fastapi import APIRouter, HTTPException, Query, status
2
  import urllib.parse
3
- from api.services.scheme_service import get_all_schemes, get_schemes_by_state, get_scheme_details_by_title, search_schemes_in_cache, get_cache_loading_status # Changed import paths
4
- # from vector_ops import search_scheme # Assuming vector_ops is still a top-level file or imported correctly
 
 
 
 
 
5
 
6
  router = APIRouter()
7
 
8
- @router.get("/schemes", summary="Get all schemes grouped by state")
9
- def get_all_schemes_grouped_by_state_endpoint():
 
 
 
 
10
  """
11
  Returns all schemes grouped by state from the in-memory cache.
12
  """
@@ -16,14 +25,17 @@ def get_all_schemes_grouped_by_state_endpoint():
16
  detail="Schemes cache is currently loading. Please try again shortly."
17
  )
18
 
19
- schemes = get_all_schemes()
20
  if not schemes:
21
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="No schemes found in cache.")
22
  return schemes
23
 
24
 
25
- @router.get("/schemes/{state}", summary="Get schemes for a specific state")
26
- def get_scheme_titles_by_state_endpoint(state: str):
 
 
 
27
  """
28
  Returns all schemes for a specific state from the in-memory cache.
29
  """
@@ -33,7 +45,7 @@ def get_scheme_titles_by_state_endpoint(state: str):
33
  detail="Schemes cache is currently loading. Please try again shortly."
34
  )
35
 
36
- schemes_for_state = get_schemes_by_state(state)
37
  if not schemes_for_state:
38
  raise HTTPException(
39
  status_code=status.HTTP_404_NOT_FOUND,
@@ -46,8 +58,12 @@ def get_scheme_titles_by_state_endpoint(state: str):
46
  }
47
 
48
 
49
- @router.get("/schemes/{state}/scheme_titles/{title}", summary="Get details for a single scheme by title")
50
- def get_scheme_details_endpoint(state: str, title: str):
 
 
 
 
51
  """
52
  Returns details for a single scheme by title within a specific state from the in-memory cache.
53
  """
@@ -58,7 +74,7 @@ def get_scheme_details_endpoint(state: str, title: str):
58
  )
59
 
60
  decoded_title = urllib.parse.unquote(title)
61
- scheme_details = get_scheme_details_by_title(state, decoded_title)
62
 
63
  if not scheme_details:
64
  raise HTTPException(
@@ -68,8 +84,11 @@ def get_scheme_details_endpoint(state: str, title: str):
68
  return scheme_details
69
 
70
 
71
- @router.get("/searchscheme", summary="Search schemes by keyword across all states")
72
- def search_schemes_endpoint(query: str = Query(..., description="Search across all schemes")):
 
 
 
73
  """
74
  Searches schemes across all states using the in-memory cache for smooth performance.
75
  """
@@ -79,22 +98,14 @@ def search_schemes_endpoint(query: str = Query(..., description="Search across a
79
  detail="Schemes cache is currently loading. Please try again shortly."
80
  )
81
 
82
- matched_schemes = search_schemes_in_cache(query)
83
 
84
  if not matched_schemes:
85
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No schemes found matching '{query}'")
86
 
87
  return {
 
88
  "query": query,
89
  "matched_count": len(matched_schemes),
90
  "results": matched_schemes
91
  }
92
-
93
-
94
- # @router.get("/semantic-search", summary="Perform semantic search on schemes")
95
- # def semantic_search_endpoint(query: str = Query(...)):
96
- # """
97
- # Performs a semantic search on schemes using an external vector_ops module.
98
- # """
99
- # results = search_scheme(query)
100
- # return {"query": query, "results": results}
 
1
+ from fastapi import APIRouter, HTTPException, Query, Path, status
2
  import urllib.parse
3
+ from api.services.scheme_service import (
4
+ get_all_schemes,
5
+ get_schemes_by_state,
6
+ get_scheme_details_by_title,
7
+ search_schemes_in_cache,
8
+ get_cache_loading_status
9
+ )
10
 
11
  router = APIRouter()
12
 
13
+ # -------------------------
14
+ # Schemes endpoints with language
15
+ # -------------------------
16
+
17
+ @router.get("/{lang}/schemes", summary="Get all schemes grouped by state")
18
+ def get_all_schemes_grouped_by_state_endpoint(lang: str = Path(..., description="Language code, e.g., en, hi")):
19
  """
20
  Returns all schemes grouped by state from the in-memory cache.
21
  """
 
25
  detail="Schemes cache is currently loading. Please try again shortly."
26
  )
27
 
28
+ schemes = get_all_schemes(lang=lang)
29
  if not schemes:
30
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="No schemes found in cache.")
31
  return schemes
32
 
33
 
34
+ @router.get("/{lang}/schemes/{state}", summary="Get schemes for a specific state")
35
+ def get_scheme_titles_by_state_endpoint(
36
+ lang: str = Path(..., description="Language code, e.g., en, hi"),
37
+ state: str = Path(..., description="State name")
38
+ ):
39
  """
40
  Returns all schemes for a specific state from the in-memory cache.
41
  """
 
45
  detail="Schemes cache is currently loading. Please try again shortly."
46
  )
47
 
48
+ schemes_for_state = get_schemes_by_state(state, lang=lang)
49
  if not schemes_for_state:
50
  raise HTTPException(
51
  status_code=status.HTTP_404_NOT_FOUND,
 
58
  }
59
 
60
 
61
+ @router.get("/{lang}/schemes/{state}/scheme_titles/{title}", summary="Get details for a single scheme by title")
62
+ def get_scheme_details_endpoint(
63
+ lang: str = Path(..., description="Language code, e.g., en, hi"),
64
+ state: str = Path(..., description="State name"),
65
+ title: str = Path(..., description="Scheme title")
66
+ ):
67
  """
68
  Returns details for a single scheme by title within a specific state from the in-memory cache.
69
  """
 
74
  )
75
 
76
  decoded_title = urllib.parse.unquote(title)
77
+ scheme_details = get_scheme_details_by_title(state, decoded_title, lang=lang)
78
 
79
  if not scheme_details:
80
  raise HTTPException(
 
84
  return scheme_details
85
 
86
 
87
+ @router.get("/{lang}/searchscheme", summary="Search schemes by keyword across all states")
88
+ def search_schemes_endpoint(
89
+ lang: str = Path(..., description="Language code, e.g., en, hi"),
90
+ query: str = Query(..., description="Search across all schemes")
91
+ ):
92
  """
93
  Searches schemes across all states using the in-memory cache for smooth performance.
94
  """
 
98
  detail="Schemes cache is currently loading. Please try again shortly."
99
  )
100
 
101
+ matched_schemes = search_schemes_in_cache(query, lang=lang)
102
 
103
  if not matched_schemes:
104
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No schemes found matching '{query}'")
105
 
106
  return {
107
+ "lang": lang,
108
  "query": query,
109
  "matched_count": len(matched_schemes),
110
  "results": matched_schemes
111
  }
 
 
 
 
 
 
 
 
 
api/services/scheme_service.py CHANGED
@@ -1,6 +1,7 @@
1
  import asyncio
2
  import logging
3
- from api.core.firebase_utils import get_firestore_db # Changed import path
 
4
 
5
  logger = logging.getLogger(__name__)
6
 
@@ -21,7 +22,7 @@ async def load_all_schemes_into_cache():
21
  is_cache_loading = True
22
  logger.info("Starting to load all schemes into cache from Firestore...")
23
  temp_schemes_cache = {}
24
- db = get_firestore_db() # Get the initialized DB client
25
 
26
  if not db:
27
  logger.error("Firestore DB client is not available. Cannot load schemes into cache.")
@@ -29,22 +30,22 @@ async def load_all_schemes_into_cache():
29
  return
30
 
31
  try:
32
- state_docs = db.collection("schemes").stream() # Get all state documents
 
33
 
34
  for state_doc in state_docs:
35
- state_name = state_doc.id
36
- scheme_ref = db.collection("schemes").document(state_name).collection("schemes")
37
  scheme_docs = scheme_ref.stream()
38
 
39
  schemes_in_state = []
40
  for scheme_doc in scheme_docs:
41
  data = scheme_doc.to_dict()
42
- data["id"] = scheme_doc.id # Add document ID to the data
43
  schemes_in_state.append(data)
44
 
45
  temp_schemes_cache[state_name] = schemes_in_state
46
 
47
- # Atomically update the global cache after successful fetch
48
  cached_all_schemes = temp_schemes_cache
49
  logger.info(f"Cache loaded successfully. Total states: {len(cached_all_schemes)}")
50
 
@@ -54,45 +55,131 @@ async def load_all_schemes_into_cache():
54
  is_cache_loading = False
55
 
56
 
57
- def get_all_schemes():
58
- """Returns all schemes from the in-memory cache."""
59
- return cached_all_schemes
 
 
 
 
60
 
61
- def get_schemes_by_state(state: str):
62
- """Returns schemes for a specific state from the in-memory cache."""
63
- state_capitalized = state.capitalize()
64
- return cached_all_schemes.get(state_capitalized)
 
65
 
66
- def get_scheme_details_by_title(state: str, title: str):
67
- """Returns details for a single scheme by title within a specific state."""
68
- state_capitalized = state.capitalize()
69
- schemes_for_state = cached_all_schemes.get(state_capitalized)
70
- if schemes_for_state:
71
- for scheme in schemes_for_state:
72
- if scheme.get("id") == title:
73
- return scheme
74
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def search_schemes_in_cache(query: str):
77
- """Searches schemes across all states within the in-memory cache."""
78
  search_query = query.strip().lower()
79
  matched = []
80
 
81
- logger.info(f"Starting search for query: '{search_query}' across {len(cached_all_schemes)} states.")
 
 
 
 
 
 
 
82
 
83
  for state_name, schemes in cached_all_schemes.items():
84
  for scheme in schemes:
85
- title = scheme.get("Title", "")
86
- description = scheme.get("Description", "")
87
-
88
- if search_query in title.lower() or search_query in description.lower():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  result = scheme.copy()
90
  result["state"] = state_name
91
  matched.append(result)
 
92
 
93
  logger.info(f"Search for '{query}' completed. Found {len(matched)} matches.")
94
  return matched
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def get_cache_loading_status():
97
  """Returns the current loading status of the cache."""
98
- return is_cache_loading
 
1
  import asyncio
2
  import logging
3
+ from difflib import SequenceMatcher
4
+ from api.core.firebase_utils import get_firestore_db
5
 
6
  logger = logging.getLogger(__name__)
7
 
 
22
  is_cache_loading = True
23
  logger.info("Starting to load all schemes into cache from Firestore...")
24
  temp_schemes_cache = {}
25
+ db = get_firestore_db()
26
 
27
  if not db:
28
  logger.error("Firestore DB client is not available. Cannot load schemes into cache.")
 
30
  return
31
 
32
  try:
33
+ # Fetch all state docs
34
+ state_docs = db.collection("schemes").stream()
35
 
36
  for state_doc in state_docs:
37
+ state_name = state_doc.id.strip().lower() # store lowercase for consistency
38
+ scheme_ref = db.collection("schemes").document(state_doc.id).collection("schemes")
39
  scheme_docs = scheme_ref.stream()
40
 
41
  schemes_in_state = []
42
  for scheme_doc in scheme_docs:
43
  data = scheme_doc.to_dict()
44
+ data["id"] = scheme_doc.id
45
  schemes_in_state.append(data)
46
 
47
  temp_schemes_cache[state_name] = schemes_in_state
48
 
 
49
  cached_all_schemes = temp_schemes_cache
50
  logger.info(f"Cache loaded successfully. Total states: {len(cached_all_schemes)}")
51
 
 
55
  is_cache_loading = False
56
 
57
 
58
+ # def get_all_schemes(lang=None):
59
+ # """Returns all schemes from the in-memory cache. If lang is provided, filter by language."""
60
+ # if not lang:
61
+ # return cached_all_schemes
62
+
63
+ # filtered_cache = {}
64
+ # for state, schemes in cached_all_schemes.items():
65
 
66
+ # filtered = [s for s in schemes if s.get("language", lang) == lang]
67
+
68
+ # if filtered:
69
+ # filtered_cache[state] = filtered
70
+ # return filtered_cache
71
 
72
+ def get_all_schemes(lang=None):
73
+ """
74
+ Returns all schemes from the in-memory cache.
75
+ If lang is provided, return all schemes that either match lang OR don't have language set.
76
+ """
77
+ if not lang:
78
+ return cached_all_schemes
79
+
80
+ filtered_cache = {}
81
+ for state, schemes in cached_all_schemes.items():
82
+ filtered = [
83
+ s for s in schemes
84
+ if not s.get("language") or s.get("language", "").lower() == lang.lower()
85
+ ]
86
+ if filtered:
87
+ filtered_cache[state] = filtered
88
+ return filtered_cache
89
+
90
+
91
+ def search_schemes_in_cache(query: str, lang: str = None):
92
+ """
93
+ Searches schemes across all states within the in-memory cache with basic stemming.
94
+ Automatically includes schemes that don't have a language field if lang is provided.
95
+ """
96
+ # SequenceMatcher is already imported at module level; no local import needed
97
 
 
 
98
  search_query = query.strip().lower()
99
  matched = []
100
 
101
+ # Create variations of the query for simple stemming
102
+ search_terms = [search_query]
103
+ if search_query.endswith('ies'):
104
+ search_terms.append(search_query[:-3] + 'y')
105
+ elif search_query.endswith('s'):
106
+ search_terms.append(search_query[:-1])
107
+
108
+ logger.info(f"Starting smart search for terms: {search_terms}...")
109
 
110
  for state_name, schemes in cached_all_schemes.items():
111
  for scheme in schemes:
112
+ # Language filter: include scheme if language matches OR no language specified
113
+ language = scheme.get("language", "")
114
+ if lang and language and language.lower() != lang.lower():
115
+ continue
116
+
117
+ # Combine all searchable fields
118
+ searchable_parts = [
119
+ scheme.get("Title", ""),
120
+ scheme.get("Description", ""),
121
+ scheme.get("Tags", ""),
122
+ ]
123
+
124
+ list_fields_to_search = ["Eligibility", "Benefits", "Details", "Documents Required"]
125
+ for field in list_fields_to_search:
126
+ items = scheme.get(field, [])
127
+ if isinstance(items, list):
128
+ searchable_parts.extend(items)
129
+ elif isinstance(items, str):
130
+ searchable_parts.append(items)
131
+
132
+ searchable_text = " ".join(searchable_parts).lower()
133
+
134
+ # Check if any search term is contained or fuzzy match (for typos)
135
+ if any(term in searchable_text for term in search_terms) or \
136
+ any(SequenceMatcher(None, term, searchable_text).ratio() > 0.7 for term in search_terms):
137
  result = scheme.copy()
138
  result["state"] = state_name
139
  matched.append(result)
140
+ # Don't break; allow multiple schemes per state if needed
141
 
142
  logger.info(f"Search for '{query}' completed. Found {len(matched)} matches.")
143
  return matched
144
 
145
+ def get_schemes_by_state(state: str, lang: str = None):
146
+ """
147
+ Returns schemes for a specific state from the in-memory cache.
148
+ """
149
+ state_key = state.strip().lower()
150
+ schemes = cached_all_schemes.get(state_key)
151
+ if not schemes:
152
+ return None
153
+
154
+ if lang:
155
+ return [s for s in schemes if not s.get("language") or s.get("language", "").lower() == lang.lower()]
156
+ return schemes
157
+
158
+ def get_scheme_details_by_title(state: str, title: str, lang: str = None):
159
+ """
160
+ Returns details for a single scheme by title or id within a specific state.
161
+ """
162
+ state_key = state.strip().lower()
163
+ schemes_for_state = cached_all_schemes.get(state_key)
164
+
165
+ if not schemes_for_state:
166
+ return None
167
+
168
+ url_title_clean = title.strip().lower()
169
+
170
+ for scheme in schemes_for_state:
171
+ db_id_clean = scheme.get("id", "").strip().lower()
172
+ db_title_clean = scheme.get("Title", "").strip().lower()
173
+
174
+ if db_id_clean == url_title_clean or db_title_clean == url_title_clean:
175
+ # Language check: accept the scheme when no filter is given, or when its
176
+ # language matches; schemes without a language field pass (default is lang).
177
+ if not lang or scheme.get("language", lang).lower() == lang.lower():
178
+ return scheme
179
+
180
+ return None
181
+
182
+
183
  def get_cache_loading_status():
184
  """Returns the current loading status of the cache."""
185
+ return is_cache_loading