VJnCode committed on
Commit
1c3cab3
·
1 Parent(s): ce60e7f

added klang to routes

Browse files
api/main.py CHANGED
@@ -1,70 +1,83 @@
 
 
1
  from fastapi import FastAPI, HTTPException, status
2
- import asyncio
3
  import logging
4
- from api.routes import endpoints # Changed import path
5
- from api.core.firebase_utils import db, initialize_firebase # Changed import path
6
- from api.services.scheme_service import load_all_schemes_into_cache, is_cache_loading, cached_all_schemes
7
 
8
- from api.routes import rag_route
 
 
9
  from fastapi.middleware.cors import CORSMiddleware
10
 
11
  # Configure logging
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
- app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
 
 
 
 
 
 
 
 
17
  app.add_middleware(
18
  CORSMiddleware,
19
- allow_origins=["*"],
20
  allow_credentials=True,
21
- allow_methods=["*"],
22
- allow_headers=["*"],
23
  )
24
 
25
- # --- Application Startup Event ---
26
- @app.on_event("startup")
27
- async def startup_event():
28
- """
29
- Called when the FastAPI application starts.
30
- Initializes Firebase and initiates the loading of schemes into the cache.
31
- """
32
- initialize_firebase()
33
- # Start cache loading in the background
34
- asyncio.create_task(load_all_schemes_into_cache())
35
- logger.info("Application startup: Initiated cache loading.")
36
-
37
- # --- API Endpoints (include routers) ---
38
- app.include_router(endpoints.router)
39
-
40
  app.include_router(rag_route.router, prefix="/api", tags=["RAG Chatbot"])
41
 
 
42
  @app.get("/")
43
  def root():
44
  """Welcome message for the API."""
45
  return {"message": "Welcome to Chathur API"}
46
 
47
- # Optional: You might still want to expose cache status or trigger refresh directly from main,
48
- # or keep them within the service layer and expose through a router.
49
- @app.get("/cache_status")
50
  def get_cache_status():
51
  """Returns the current status of the scheme cache."""
52
  return {
53
- "cache_loaded": bool(cached_all_schemes),
54
- "cache_loading": is_cache_loading,
55
  "states_in_cache": len(cached_all_schemes)
56
  }
57
 
58
- @app.post("/schemes/refresh_cache")
59
  async def refresh_schemes_cache():
60
  """
61
- Manually triggers a refresh of the in-memory schemes cache from Firestore.
62
- Use this endpoint if your Firestore data changes and you need the API to reflect it immediately.
63
  """
64
- if is_cache_loading:
65
  raise HTTPException(
66
  status_code=status.HTTP_409_CONFLICT,
67
  detail="Cache refresh already in progress."
68
  )
69
- asyncio.create_task(load_all_schemes_into_cache()) # Trigger in background
70
- return {"message": "Schemes cache refresh initiated. It will be updated shortly."}
 
 
 
1
# main.py

import asyncio
import logging
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware

from api.routes import endpoints, rag_route
from api.core.firebase_utils import initialize_firebase
from api.services.scheme_service import load_all_schemes_into_cache, is_cache_loading, cached_all_schemes
11
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
# --- Lifespan Manager for Startup/Shutdown Events ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup and shutdown logic around the application's lifetime.

    Startup: initialize Firebase, then await the full cache load so no
    request is served against an empty cache. Shutdown: log teardown.
    """
    logger.info("Application startup sequence initiated...")
    initialize_firebase()
    # Deliberately awaited (not create_task): the app must not start
    # accepting requests until the scheme cache is warm.
    await load_all_schemes_into_cache()
    logger.info("Application startup complete: Firebase initialized and cache fully loaded.")

    yield  # the application serves requests while suspended here

    logger.info("Application shutting down.")
33
 
34
# Create the FastAPI app instance with the lifespan manager
app = FastAPI(
    title="Chathur API",
    description="API for government schemes and RAG chatbot.",
    lifespan=lifespan,
)

# --- Middleware ---
# NOTE(review): wildcard origins combined with allow_credentials=True is a
# permissive CORS setup; consider pinning allow_origins for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
49
 
50
# --- API Endpoints (Routers) ---
# endpoints.router carries no prefix, so its routes sit at the root;
# the RAG chatbot routes are namespaced under /api.
app.include_router(endpoints.router)
app.include_router(rag_route.router, prefix="/api", tags=["RAG Chatbot"])
54
 
55
# --- Root and Utility Endpoints ---
@app.get("/")
def root():
    """Return the API's welcome message."""
    return {"message": "Welcome to Chathur API"}
60
 
61
+ @app.get("/cache-status", tags=["Cache Management"])
 
 
62
  def get_cache_status():
63
  """Returns the current status of the scheme cache."""
64
  return {
65
+ "is_cache_loading": is_cache_loading(),
66
+ "is_cache_populated": bool(cached_all_schemes),
67
  "states_in_cache": len(cached_all_schemes)
68
  }
69
 
70
+ @app.post("/schemes/refresh-cache", status_code=status.HTTP_202_ACCEPTED, tags=["Cache Management"])
71
  async def refresh_schemes_cache():
72
  """
73
+ Manually triggers a background refresh of the in-memory schemes cache.
 
74
  """
75
+ if is_cache_loading():
76
  raise HTTPException(
77
  status_code=status.HTTP_409_CONFLICT,
78
  detail="Cache refresh already in progress."
79
  )
80
+ # create_task is appropriate here because it's a manual trigger;
81
+ # we want to return a response immediately, not wait for the refresh.
82
+ asyncio.create_task(load_all_schemes_into_cache())
83
+ return {"message": "Schemes cache refresh initiated in the background."}
api/rag/IndicTrans2 ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 53fd3e9df8ca5a5fc9d92f45027959f0b0e0b14f
api/rag/IndicTransToolkit ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 3efb8418d0721b4ce267c2b3586899d313191357
api/rag/rag.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
api/rag/trail.ipynb ADDED
@@ -0,0 +1,634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 5,
6
+ "id": "5c1018e2",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "[WinError 3] The system cannot find the path specified: '/content/IndicTrans2/huggingface_interface'\n",
14
+ "d:\\Major Project\\Chathur\\Bakend_HuggingFace\\api\\rag\n"
15
+ ]
16
+ },
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "fatal: destination path 'IndicTrans2' already exists and is not an empty directory.\n"
22
+ ]
23
+ }
24
+ ],
25
+ "source": [
26
+ "!git clone https://github.com/AI4Bharat/IndicTrans2.git\n",
27
+ "%cd /content/IndicTrans2/huggingface_interface"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 6,
33
+ "id": "b4190411",
34
+ "metadata": {},
35
+ "outputs": [
36
+ {
37
+ "name": "stderr",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "[nltk_data] Downloading package punkt to\n",
41
+ "[nltk_data] C:\\Users\\Hp\\AppData\\Roaming\\nltk_data...\n",
42
+ "[nltk_data] Package punkt is already up-to-date!\n"
43
+ ]
44
+ },
45
+ {
46
+ "name": "stdout",
47
+ "output_type": "stream",
48
+ "text": [
49
+ "Requirement already satisfied: bitsandbytes in d:\\major project\\chathur\\.venv\\lib\\site-packages (0.47.0)\n",
50
+ "Requirement already satisfied: scipy in d:\\major project\\chathur\\.venv\\lib\\site-packages (1.16.1)\n",
51
+ "Requirement already satisfied: accelerate in d:\\major project\\chathur\\.venv\\lib\\site-packages (1.10.1)\n",
52
+ "Requirement already satisfied: datasets in d:\\major project\\chathur\\.venv\\lib\\site-packages (4.0.0)\n",
53
+ "Requirement already satisfied: torch<3,>=2.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from bitsandbytes) (2.8.0)\n",
54
+ "Requirement already satisfied: numpy>=1.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from bitsandbytes) (2.3.2)\n",
55
+ "Requirement already satisfied: filelock in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (3.19.1)\n",
56
+ "Requirement already satisfied: typing-extensions>=4.10.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (4.15.0)\n",
57
+ "Requirement already satisfied: sympy>=1.13.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (1.14.0)\n",
58
+ "Requirement already satisfied: networkx in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (3.5)\n",
59
+ "Requirement already satisfied: jinja2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (3.1.6)\n",
60
+ "Requirement already satisfied: fsspec in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (2025.3.0)\n",
61
+ "Requirement already satisfied: setuptools in d:\\major project\\chathur\\.venv\\lib\\site-packages (from torch<3,>=2.2->bitsandbytes) (80.9.0)\n",
62
+ "Requirement already satisfied: packaging>=20.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (24.2)\n",
63
+ "Requirement already satisfied: psutil in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (7.0.0)\n",
64
+ "Requirement already satisfied: pyyaml in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (6.0.2)\n",
65
+ "Requirement already satisfied: huggingface_hub>=0.21.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (0.34.4)\n",
66
+ "Requirement already satisfied: safetensors>=0.4.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from accelerate) (0.6.2)\n",
67
+ "Requirement already satisfied: pyarrow>=15.0.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (21.0.0)\n",
68
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (0.3.8)\n",
69
+ "Requirement already satisfied: pandas in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (2.3.2)\n",
70
+ "Requirement already satisfied: requests>=2.32.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (2.32.5)\n",
71
+ "Requirement already satisfied: tqdm>=4.66.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (4.67.1)\n",
72
+ "Requirement already satisfied: xxhash in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (3.5.0)\n",
73
+ "Requirement already satisfied: multiprocess<0.70.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from datasets) (0.70.16)\n",
74
+ "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.12.15)\n",
75
+ "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n",
76
+ "Requirement already satisfied: aiosignal>=1.4.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)\n",
77
+ "Requirement already satisfied: attrs>=17.3.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.3.0)\n",
78
+ "Requirement already satisfied: frozenlist>=1.1.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.7.0)\n",
79
+ "Requirement already satisfied: multidict<7.0,>=4.5 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.6.4)\n",
80
+ "Requirement already satisfied: propcache>=0.2.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.3.2)\n",
81
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.20.1)\n",
82
+ "Requirement already satisfied: idna>=2.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from yarl<2.0,>=1.17.0->aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.10)\n",
83
+ "Requirement already satisfied: charset_normalizer<4,>=2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.32.2->datasets) (3.4.3)\n",
84
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.32.2->datasets) (2.5.0)\n",
85
+ "Requirement already satisfied: certifi>=2017.4.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.32.2->datasets) (2025.8.3)\n",
86
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sympy>=1.13.3->torch<3,>=2.2->bitsandbytes) (1.3.0)\n",
87
+ "Requirement already satisfied: colorama in d:\\major project\\chathur\\.venv\\lib\\site-packages (from tqdm>=4.66.3->datasets) (0.4.6)\n",
88
+ "Requirement already satisfied: MarkupSafe>=2.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from jinja2->torch<3,>=2.2->bitsandbytes) (3.0.2)\n",
89
+ "Requirement already satisfied: python-dateutil>=2.8.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->datasets) (2.9.0.post0)\n",
90
+ "Requirement already satisfied: pytz>=2020.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->datasets) (2025.2)\n",
91
+ "Requirement already satisfied: tzdata>=2022.7 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->datasets) (2025.2)\n",
92
+ "Requirement already satisfied: six>=1.5 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n",
93
+ "Requirement already satisfied: sentencepiece in d:\\major project\\chathur\\.venv\\lib\\site-packages (0.2.1)\n",
94
+ "d:\\Major Project\\Chathur\\Bakend_HuggingFace\\api\\rag\\IndicTransToolkit\n"
95
+ ]
96
+ },
97
+ {
98
+ "name": "stderr",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "fatal: destination path 'IndicTransToolkit' already exists and is not an empty directory.\n"
102
+ ]
103
+ },
104
+ {
105
+ "name": "stdout",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "Obtaining file:///D:/Major%20Project/Chathur/Bakend_HuggingFace/api/rag/IndicTransToolkit\n",
109
+ " Installing build dependencies: started\n",
110
+ " Installing build dependencies: finished with status 'done'\n",
111
+ " Checking if build backend supports build_editable: started\n",
112
+ " Checking if build backend supports build_editable: finished with status 'done'\n",
113
+ " Getting requirements to build editable: started\n",
114
+ " Getting requirements to build editable: finished with status 'done'\n",
115
+ " Preparing editable metadata (pyproject.toml): started\n",
116
+ " Preparing editable metadata (pyproject.toml): finished with status 'done'\n",
117
+ "Requirement already satisfied: cython in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (3.1.3)\n",
118
+ "Requirement already satisfied: sacremoses in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (0.1.1)\n",
119
+ "Requirement already satisfied: transformers in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (4.55.4)\n",
120
+ "Requirement already satisfied: sacrebleu in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (2.5.1)\n",
121
+ "Requirement already satisfied: indic-nlp-library-itt in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indictranstoolkit==1.1.1) (0.1.1)\n",
122
+ "Requirement already satisfied: morfessor in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.6)\n",
123
+ "Requirement already satisfied: numpy in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.3.2)\n",
124
+ "Requirement already satisfied: pandas in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.3.2)\n",
125
+ "Requirement already satisfied: sphinx-argparse in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (0.5.2)\n",
126
+ "Requirement already satisfied: sphinx-rtd-theme in d:\\major project\\chathur\\.venv\\lib\\site-packages (from indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.0.2)\n",
127
+ "Requirement already satisfied: python-dateutil>=2.8.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.9.0.post0)\n",
128
+ "Requirement already satisfied: pytz>=2020.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2025.2)\n",
129
+ "Requirement already satisfied: tzdata>=2022.7 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from pandas->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2025.2)\n",
130
+ "Requirement already satisfied: six>=1.5 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->indic-nlp-library-itt->indictranstoolkit==1.1.1) (1.17.0)\n",
131
+ "Requirement already satisfied: portalocker in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (3.2.0)\n",
132
+ "Requirement already satisfied: regex in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (2025.7.34)\n",
133
+ "Requirement already satisfied: tabulate>=0.8.9 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (0.9.0)\n",
134
+ "Requirement already satisfied: colorama in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (0.4.6)\n",
135
+ "Requirement already satisfied: lxml in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacrebleu->indictranstoolkit==1.1.1) (6.0.1)\n",
136
+ "Requirement already satisfied: pywin32>=226 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from portalocker->sacrebleu->indictranstoolkit==1.1.1) (311)\n",
137
+ "Requirement already satisfied: click in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacremoses->indictranstoolkit==1.1.1) (8.2.1)\n",
138
+ "Requirement already satisfied: joblib in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacremoses->indictranstoolkit==1.1.1) (1.5.2)\n",
139
+ "Requirement already satisfied: tqdm in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sacremoses->indictranstoolkit==1.1.1) (4.67.1)\n",
140
+ "Requirement already satisfied: sphinx>=5.1.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (8.2.3)\n",
141
+ "Requirement already satisfied: docutils>=0.19 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (0.21.2)\n",
142
+ "Requirement already satisfied: sphinxcontrib-applehelp>=1.0.7 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.0)\n",
143
+ "Requirement already satisfied: sphinxcontrib-devhelp>=1.0.6 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.0)\n",
144
+ "Requirement already satisfied: sphinxcontrib-htmlhelp>=2.0.6 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.1.0)\n",
145
+ "Requirement already satisfied: sphinxcontrib-jsmath>=1.0.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (1.0.1)\n",
146
+ "Requirement already satisfied: sphinxcontrib-qthelp>=1.0.6 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.0)\n",
147
+ "Requirement already satisfied: sphinxcontrib-serializinghtml>=1.1.9 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.0.0)\n",
148
+ "Requirement already satisfied: Jinja2>=3.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.1.6)\n",
149
+ "Requirement already satisfied: Pygments>=2.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.19.2)\n",
150
+ "Requirement already satisfied: snowballstemmer>=2.2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.0.1)\n",
151
+ "Requirement already satisfied: babel>=2.13 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.17.0)\n",
152
+ "Requirement already satisfied: alabaster>=0.7.14 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (1.0.0)\n",
153
+ "Requirement already satisfied: imagesize>=1.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (1.4.1)\n",
154
+ "Requirement already satisfied: requests>=2.30.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.32.5)\n",
155
+ "Requirement already satisfied: roman-numerals-py>=1.0.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.1.0)\n",
156
+ "Requirement already satisfied: packaging>=23.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (24.2)\n",
157
+ "Requirement already satisfied: MarkupSafe>=2.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from Jinja2>=3.1->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.0.2)\n",
158
+ "Requirement already satisfied: charset_normalizer<4,>=2 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.30.0->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.4.3)\n",
159
+ "Requirement already satisfied: idna<4,>=2.5 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.30.0->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (3.10)\n",
160
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.30.0->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2.5.0)\n",
161
+ "Requirement already satisfied: certifi>=2017.4.17 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from requests>=2.30.0->sphinx>=5.1.0->sphinx-argparse->indic-nlp-library-itt->indictranstoolkit==1.1.1) (2025.8.3)\n",
162
+ "Requirement already satisfied: sphinxcontrib-jquery<5,>=4 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from sphinx-rtd-theme->indic-nlp-library-itt->indictranstoolkit==1.1.1) (4.1)\n",
163
+ "Requirement already satisfied: filelock in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (3.19.1)\n",
164
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.34.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (0.34.4)\n",
165
+ "Requirement already satisfied: pyyaml>=5.1 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (6.0.2)\n",
166
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (0.21.4)\n",
167
+ "Requirement already satisfied: safetensors>=0.4.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from transformers->indictranstoolkit==1.1.1) (0.6.2)\n",
168
+ "Requirement already satisfied: fsspec>=2023.5.0 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.34.0->transformers->indictranstoolkit==1.1.1) (2025.3.0)\n",
169
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in d:\\major project\\chathur\\.venv\\lib\\site-packages (from huggingface-hub<1.0,>=0.34.0->transformers->indictranstoolkit==1.1.1) (4.15.0)\n",
170
+ "Building wheels for collected packages: indictranstoolkit\n",
171
+ " Building editable for indictranstoolkit (pyproject.toml): started\n",
172
+ " Building editable for indictranstoolkit (pyproject.toml): finished with status 'error'\n",
173
+ "Failed to build indictranstoolkit\n",
174
+ "d:\\Major Project\\Chathur\\Bakend_HuggingFace\\api\\rag\n"
175
+ ]
176
+ },
177
+ {
178
+ "name": "stderr",
179
+ "output_type": "stream",
180
+ "text": [
181
+ " error: subprocess-exited-with-error\n",
182
+ " \n",
183
+ " × Building editable for indictranstoolkit (pyproject.toml) did not run successfully.\n",
184
+ " │ exit code: 1\n",
185
+ " ╰─> [69 lines of output]\n",
186
+ " C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-build-env-fz7kfyri\\overlay\\Lib\\site-packages\\setuptools\\config\\_apply_pyprojecttoml.py:82: SetuptoolsDeprecationWarning: `project.license` as a TOML table is deprecated\n",
187
+ " !!\n",
188
+ " \n",
189
+ " ********************************************************************************\n",
190
+ " Please use a simple string containing a SPDX expression for `project.license`. You can also use `project.license-files`. (Both options available on setuptools>=77.0.0).\n",
191
+ " \n",
192
+ " By 2026-Feb-18, you need to update your project and remove deprecated calls\n",
193
+ " or your builds will no longer be supported.\n",
194
+ " \n",
195
+ " See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.\n",
196
+ " ********************************************************************************\n",
197
+ " \n",
198
+ " !!\n",
199
+ " corresp(dist, value, root_dir)\n",
200
+ " C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-build-env-fz7kfyri\\overlay\\Lib\\site-packages\\setuptools\\config\\_apply_pyprojecttoml.py:61: SetuptoolsDeprecationWarning: License classifiers are deprecated.\n",
201
+ " !!\n",
202
+ " \n",
203
+ " ********************************************************************************\n",
204
+ " Please consider removing the following classifiers in favor of a SPDX license expression:\n",
205
+ " \n",
206
+ " License :: OSI Approved :: MIT License\n",
207
+ " \n",
208
+ " See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.\n",
209
+ " ********************************************************************************\n",
210
+ " \n",
211
+ " !!\n",
212
+ " dist._finalize_license_expression()\n",
213
+ " C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-build-env-fz7kfyri\\overlay\\Lib\\site-packages\\setuptools\\dist.py:759: SetuptoolsDeprecationWarning: License classifiers are deprecated.\n",
214
+ " !!\n",
215
+ " \n",
216
+ " ********************************************************************************\n",
217
+ " Please consider removing the following classifiers in favor of a SPDX license expression:\n",
218
+ " \n",
219
+ " License :: OSI Approved :: MIT License\n",
220
+ " \n",
221
+ " See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.\n",
222
+ " ********************************************************************************\n",
223
+ " \n",
224
+ " !!\n",
225
+ " self._finalize_license_expression()\n",
226
+ " running editable_wheel\n",
227
+ " creating C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\n",
228
+ " writing C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\PKG-INFO\n",
229
+ " writing dependency_links to C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\dependency_links.txt\n",
230
+ " writing requirements to C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\requires.txt\n",
231
+ " writing top-level names to C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\top_level.txt\n",
232
+ " writing manifest file 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\SOURCES.txt'\n",
233
+ " reading manifest file 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\SOURCES.txt'\n",
234
+ " reading manifest template 'MANIFEST.in'\n",
235
+ " warning: no files found matching '*.so' under directory 'IndicTransToolkit'\n",
236
+ " adding license file 'LICENSE'\n",
237
+ " writing manifest file 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit.egg-info\\SOURCES.txt'\n",
238
+ " creating 'C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit-1.1.1.dist-info'\n",
239
+ " creating C:\\Users\\Hp\\AppData\\Local\\Temp\\pip-wheel-2ufwhvg9\\.tmp-a00g4gbl\\indictranstoolkit-1.1.1.dist-info\\WHEEL\n",
240
+ " running build_py\n",
241
+ " running build_ext\n",
242
+ " building 'IndicTransToolkit.processor' extension\n",
243
+ " creating C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\n",
244
+ " \"C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\BuildTools\\VC\\Tools\\MSVC\\14.29.30133\\bin\\HostX86\\x64\\cl.exe\" /c /nologo /O2 /W3 /GL /DNDEBUG /MD \"-Id:\\Major Project\\Chathur\\.venv\\include\" -ID:\\SOFTWARE\\Python\\include -ID:\\SOFTWARE\\Python\\Include \"-IC:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\BuildTools\\VC\\Tools\\MSVC\\14.29.30133\\include\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\ucrt\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\shared\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\um\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\winrt\" \"-IC:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.19041.0\\cppwinrt\" /TcIndicTransToolkit/processor.c /FoC:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.obj\n",
245
+ " processor.c\n",
246
+ " IndicTransToolkit/processor.c(7951): warning C4244: '=': conversion from 'Py_ssize_t' to 'int', possible loss of data\n",
247
+ " IndicTransToolkit/processor.c(8597): warning C4244: '=': conversion from 'Py_ssize_t' to 'int', possible loss of data\n",
248
+ " creating C:\\Users\\Hp\\AppData\\Local\\Temp\\tmp645n8bz_.build-lib\\IndicTransToolkit\n",
249
+ " \"C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\BuildTools\\VC\\Tools\\MSVC\\14.29.30133\\bin\\HostX86\\x64\\link.exe\" /nologo /INCREMENTAL:NO /LTCG /DLL /MANIFEST:EMBED,ID=2 /MANIFESTUAC:NO \"/LIBPATH:d:\\Major Project\\Chathur\\.venv\\libs\" /LIBPATH:D:\\SOFTWARE\\Python\\libs /LIBPATH:D:\\SOFTWARE\\Python \"/LIBPATH:d:\\Major Project\\Chathur\\.venv\\PCbuild\\amd64\" \"/LIBPATH:C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\BuildTools\\VC\\Tools\\MSVC\\14.29.30133\\lib\\x64\" \"/LIBPATH:C:\\Program Files (x86)\\Windows Kits\\10\\lib\\10.0.19041.0\\ucrt\\x64\" \"/LIBPATH:C:\\Program Files (x86)\\Windows Kits\\10\\lib\\10.0.19041.0\\um\\x64\" /EXPORT:PyInit_processor C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.obj /OUT:C:\\Users\\Hp\\AppData\\Local\\Temp\\tmp645n8bz_.build-lib\\IndicTransToolkit\\processor.cp313-win_amd64.pyd /IMPLIB:C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.cp313-win_amd64.lib\n",
250
+ " Creating library C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.cp313-win_amd64.lib and object C:\\Users\\Hp\\AppData\\Local\\Temp\\tmpf09qp786.build-temp\\Release\\IndicTransToolkit\\processor.cp313-win_amd64.exp\n",
251
+ " Generating code\n",
252
+ " Finished generating code\n",
253
+ " copying C:\\Users\\Hp\\AppData\\Local\\Temp\\tmp645n8bz_.build-lib\\IndicTransToolkit\\processor.cp313-win_amd64.pyd -> IndicTransToolkit\n",
254
+ " error: could not delete 'IndicTransToolkit\\processor.cp313-win_amd64.pyd': Access is denied\n",
255
+ " [end of output]\n",
256
+ " \n",
257
+ " note: This error originates from a subprocess, and is likely not a problem with pip.\n",
258
+ " ERROR: Failed building editable for indictranstoolkit\n",
259
+ "error: failed-wheel-build-for-install\n",
260
+ "\n",
261
+ "× Failed to build installable wheels for some pyproject.toml based projects\n",
262
+ "╰─> indictranstoolkit\n"
263
+ ]
264
+ }
265
+ ],
266
+ "source": [
267
+ "!python -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer\n",
268
+ "!python -c \"import nltk; nltk.download('punkt')\"\n",
269
+ "!python -m pip install bitsandbytes scipy accelerate datasets\n",
270
+ "!python -m pip install sentencepiece\n",
271
+ "\n",
272
+ "!git clone https://github.com/VarunGumma/IndicTransToolkit.git\n",
273
+ "%cd IndicTransToolkit\n",
274
+ "!python -m pip install --editable ./\n",
275
+ "%cd .."
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 1,
281
+ "id": "81d64601",
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "import torch\n",
286
+ "from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer\n",
287
+ "from IndicTransToolkit.processor import IndicProcessor\n",
288
+ "\n",
289
+ "BATCH_SIZE = 4\n",
290
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
291
+ "quantization = None"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": 2,
297
+ "id": "d260bc8d",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "def initialize_model_and_tokenizer(ckpt_dir, quantization):\n",
302
+ " if quantization == \"4-bit\":\n",
303
+ " qconfig = BitsAndBytesConfig(\n",
304
+ " load_in_4bit=True,\n",
305
+ " bnb_4bit_use_double_quant=True,\n",
306
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
307
+ " )\n",
308
+ " elif quantization == \"8-bit\":\n",
309
+ " qconfig = BitsAndBytesConfig(\n",
310
+ " load_in_8bit=True,\n",
311
+ " bnb_8bit_use_double_quant=True,\n",
312
+ " bnb_8bit_compute_dtype=torch.bfloat16,\n",
313
+ " )\n",
314
+ " else:\n",
315
+ " qconfig = None\n",
316
+ "\n",
317
+ " tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)\n",
318
+ " model = AutoModelForSeq2SeqLM.from_pretrained(\n",
319
+ " ckpt_dir,\n",
320
+ " trust_remote_code=True,\n",
321
+ " low_cpu_mem_usage=True,\n",
322
+ " quantization_config=qconfig,\n",
323
+ " )\n",
324
+ "\n",
325
+ " if qconfig == None:\n",
326
+ " model = model.to(DEVICE)\n",
327
+ " if DEVICE == \"cuda\":\n",
328
+ " model.half()\n",
329
+ "\n",
330
+ " model.eval()\n",
331
+ "\n",
332
+ " return tokenizer, model\n",
333
+ "\n",
334
+ "\n",
335
+ "def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):\n",
336
+ " translations = []\n",
337
+ " for i in range(0, len(input_sentences), BATCH_SIZE):\n",
338
+ " batch = input_sentences[i : i + BATCH_SIZE]\n",
339
+ "\n",
340
+ " # Preprocess the batch and extract entity mappings\n",
341
+ " batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)\n",
342
+ "\n",
343
+ " # Tokenize the batch and generate input encodings\n",
344
+ " inputs = tokenizer(\n",
345
+ " batch,\n",
346
+ " truncation=True,\n",
347
+ " padding=\"longest\",\n",
348
+ " return_tensors=\"pt\",\n",
349
+ " return_attention_mask=True,\n",
350
+ " ).to(DEVICE)\n",
351
+ "\n",
352
+ " # Generate translations using the model\n",
353
+ " with torch.no_grad():\n",
354
+ " generated_tokens = model.generate(\n",
355
+ " **inputs,\n",
356
+ " use_cache=True,\n",
357
+ " min_length=0,\n",
358
+ " max_length=256,\n",
359
+ " num_beams=5,\n",
360
+ " num_return_sequences=1,\n",
361
+ " )\n",
362
+ "\n",
363
+ " # Decode the generated tokens into text\n",
364
+ " generated_tokens = tokenizer.batch_decode(\n",
365
+ " generated_tokens,\n",
366
+ " skip_special_tokens=True,\n",
367
+ " clean_up_tokenization_spaces=True,\n",
368
+ " )\n",
369
+ "\n",
370
+ " # Postprocess the translations, including entity replacement\n",
371
+ " translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)\n",
372
+ "\n",
373
+ " del inputs\n",
374
+ " torch.cuda.empty_cache()\n",
375
+ "\n",
376
+ " return translations"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": 3,
382
+ "id": "634056be",
383
+ "metadata": {},
384
+ "outputs": [
385
+ {
386
+ "ename": "AssertionError",
387
+ "evalue": "Invalid source language tag: <hin_Deva>",
388
+ "output_type": "error",
389
+ "traceback": [
390
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
391
+ "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)",
392
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 33\u001b[39m\n\u001b[32m 28\u001b[39m \u001b[38;5;66;03m# Example\u001b[39;00m\n\u001b[32m 29\u001b[39m en_sents = [\n\u001b[32m 30\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mWhen I was young, I used to go to the park every day.\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 31\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mHe has many old books, which he inherited from his ancestors.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 32\u001b[39m ]\n\u001b[32m---> \u001b[39m\u001b[32m33\u001b[39m translations = \u001b[43mbatch_translate\u001b[49m\u001b[43m(\u001b[49m\u001b[43men_sents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43meng_Latn\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mhin_Deva\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 35\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m src, tgt \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(en_sents, translations):\n\u001b[32m 36\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msrc\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m --> \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtgt\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
393
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 19\u001b[39m, in \u001b[36mbatch_translate\u001b[39m\u001b[34m(sentences, src_lang, tgt_lang)\u001b[39m\n\u001b[32m 16\u001b[39m tagged_sentences = [\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m<\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtgt_lang\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m> \u001b[39m\u001b[38;5;132;01m{\u001b[39;00ms\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m sentences]\n\u001b[32m 18\u001b[39m \u001b[38;5;66;03m# Tokenize\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m19\u001b[39m inputs = \u001b[43mtokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtagged_sentences\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mpt\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m.to(DEVICE)\n\u001b[32m 21\u001b[39m \u001b[38;5;66;03m# Generate translations\u001b[39;00m\n\u001b[32m 22\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n",
394
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:2910\u001b[39m, in \u001b[36mPreTrainedTokenizerBase.__call__\u001b[39m\u001b[34m(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)\u001b[39m\n\u001b[32m 2908\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._in_target_context_manager:\n\u001b[32m 2909\u001b[39m \u001b[38;5;28mself\u001b[39m._switch_to_input_mode()\n\u001b[32m-> \u001b[39m\u001b[32m2910\u001b[39m encodings = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_call_one\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtext_pair\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mall_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2911\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m text_target \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 2912\u001b[39m \u001b[38;5;28mself\u001b[39m._switch_to_target_mode()\n",
395
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:2998\u001b[39m, in \u001b[36mPreTrainedTokenizerBase._call_one\u001b[39m\u001b[34m(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[39m\n\u001b[32m 2993\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 2994\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mbatch length of `text`: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(text)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m does not match batch length of `text_pair`:\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2995\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(text_pair)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 2996\u001b[39m )\n\u001b[32m 2997\u001b[39m batch_text_or_text_pairs = \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mzip\u001b[39m(text, text_pair)) \u001b[38;5;28;01mif\u001b[39;00m text_pair \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m text\n\u001b[32m-> \u001b[39m\u001b[32m2998\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mbatch_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 2999\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3000\u001b[39m \u001b[43m 
\u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3001\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3002\u001b[39m \u001b[43m \u001b[49m\u001b[43mtruncation\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtruncation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3003\u001b[39m \u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3004\u001b[39m \u001b[43m \u001b[49m\u001b[43mstride\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3005\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3006\u001b[39m \u001b[43m \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3007\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3008\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3009\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3010\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3011\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3012\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3013\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3014\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3015\u001b[39m \u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m=\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3016\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3017\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3018\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3019\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 3020\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.encode_plus(\n\u001b[32m 3021\u001b[39m text=text,\n\u001b[32m 3022\u001b[39m text_pair=text_pair,\n\u001b[32m (...)\u001b[39m\u001b[32m 3040\u001b[39m **kwargs,\n\u001b[32m 3041\u001b[39m )\n",
396
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:3199\u001b[39m, in \u001b[36mPreTrainedTokenizerBase.batch_encode_plus\u001b[39m\u001b[34m(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[39m\n\u001b[32m 3189\u001b[39m \u001b[38;5;66;03m# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'\u001b[39;00m\n\u001b[32m 3190\u001b[39m padding_strategy, truncation_strategy, max_length, kwargs = \u001b[38;5;28mself\u001b[39m._get_padding_truncation_strategies(\n\u001b[32m 3191\u001b[39m padding=padding,\n\u001b[32m 3192\u001b[39m truncation=truncation,\n\u001b[32m (...)\u001b[39m\u001b[32m 3196\u001b[39m **kwargs,\n\u001b[32m 3197\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m3199\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_batch_encode_plus\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 3200\u001b[39m \u001b[43m \u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbatch_text_or_text_pairs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3201\u001b[39m \u001b[43m \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3202\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpadding_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3203\u001b[39m \u001b[43m \u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtruncation_strategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3204\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3205\u001b[39m \u001b[43m \u001b[49m\u001b[43mstride\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3206\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_split_into_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3207\u001b[39m \u001b[43m \u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpad_to_multiple_of\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3208\u001b[39m \u001b[43m \u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpadding_side\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3209\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_tensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3210\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_token_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3211\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3212\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_overflowing_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3213\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_special_tokens_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3214\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_offsets_mapping\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3215\u001b[39m \u001b[43m \u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[43m=\u001b[49m\u001b[43mreturn_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3216\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m=\u001b[49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3217\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[43msplit_special_tokens\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3218\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 3219\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
397
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils.py:887\u001b[39m, in \u001b[36mPreTrainedTokenizer._batch_encode_plus\u001b[39m\u001b[34m(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, padding_side, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, split_special_tokens, **kwargs)\u001b[39m\n\u001b[32m 884\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 885\u001b[39m ids, pair_ids = ids_or_pair_ids\n\u001b[32m--> \u001b[39m\u001b[32m887\u001b[39m first_ids = \u001b[43mget_input_ids\u001b[49m\u001b[43m(\u001b[49m\u001b[43mids\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 888\u001b[39m second_ids = get_input_ids(pair_ids) \u001b[38;5;28;01mif\u001b[39;00m pair_ids \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 889\u001b[39m input_ids.append((first_ids, second_ids))\n",
398
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils.py:854\u001b[39m, in \u001b[36mPreTrainedTokenizer._batch_encode_plus.<locals>.get_input_ids\u001b[39m\u001b[34m(text)\u001b[39m\n\u001b[32m 852\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_input_ids\u001b[39m(text):\n\u001b[32m 853\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text, \u001b[38;5;28mstr\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m854\u001b[39m tokens = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 855\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.convert_tokens_to_ids(tokens)\n\u001b[32m 856\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text, (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(text) > \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(text[\u001b[32m0\u001b[39m], \u001b[38;5;28mstr\u001b[39m):\n",
399
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\tokenization_utils.py:697\u001b[39m, in \u001b[36mPreTrainedTokenizer.tokenize\u001b[39m\u001b[34m(self, text, **kwargs)\u001b[39m\n\u001b[32m 695\u001b[39m tokenized_text.append(token)\n\u001b[32m 696\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m697\u001b[39m tokenized_text.extend(\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_tokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[32m 698\u001b[39m \u001b[38;5;66;03m# [\"This\", \" is\", \" something\", \"<special_token_1>\", \"else\"]\u001b[39;00m\n\u001b[32m 699\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m tokenized_text\n",
400
+ "\u001b[36mFile \u001b[39m\u001b[32m~\\.cache\\huggingface\\modules\\transformers_modules\\ai4bharat\\indictrans2-en-indic-1B\\10e65a9951a1e922cd109a95e8aba9357b62144b\\tokenization_indictrans.py:201\u001b[39m, in \u001b[36mIndicTransTokenizer._src_tokenize\u001b[39m\u001b[34m(self, text)\u001b[39m\n\u001b[32m 199\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_src_tokenize\u001b[39m(\u001b[38;5;28mself\u001b[39m, text: \u001b[38;5;28mstr\u001b[39m) -> List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[32m 200\u001b[39m src_lang, tgt_lang, text = text.split(\u001b[33m\"\u001b[39m\u001b[33m \u001b[39m\u001b[33m\"\u001b[39m, \u001b[32m2\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m201\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m src_lang \u001b[38;5;129;01min\u001b[39;00m LANGUAGE_TAGS, \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInvalid source language tag: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msrc_lang\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 202\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m tgt_lang \u001b[38;5;129;01min\u001b[39;00m LANGUAGE_TAGS, \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInvalid target language tag: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtgt_lang\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 203\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m [src_lang, tgt_lang] + \u001b[38;5;28mself\u001b[39m.spm.EncodeAsPieces(text)\n",
401
+ "\u001b[31mAssertionError\u001b[39m: Invalid source language tag: <hin_Deva>"
402
+ ]
403
+ }
404
+ ],
405
+ "source": [
406
+ "import torch\n",
407
+ "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
408
+ "\n",
409
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
410
+ "\n",
411
+ "model_name = \"ai4bharat/indictrans2-en-indic-1B\"\n",
412
+ "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
413
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\n",
414
+ " model_name,\n",
415
+ " trust_remote_code=True,\n",
416
+ " torch_dtype=torch.float32 # safer on CPU/Windows\n",
417
+ ").to(DEVICE)\n",
418
+ "\n",
419
+ "def batch_translate(sentences, src_lang, tgt_lang):\n",
420
+ " # Add target language tag to each sentence\n",
421
+ " tagged_sentences = [f\"<{tgt_lang}> {s}\" for s in sentences]\n",
422
+ "\n",
423
+ " # Tokenize\n",
424
+ " inputs = tokenizer(tagged_sentences, return_tensors=\"pt\", padding=True, truncation=True).to(DEVICE)\n",
425
+ "\n",
426
+ " # Generate translations\n",
427
+ " with torch.no_grad():\n",
428
+ " outputs = model.generate(**inputs, max_length=256, num_beams=5)\n",
429
+ "\n",
430
+ " # Decode\n",
431
+ " return tokenizer.batch_decode(outputs, skip_special_tokens=True)\n",
432
+ "\n",
433
+ "# Example\n",
434
+ "en_sents = [\n",
435
+ " \"When I was young, I used to go to the park every day.\",\n",
436
+ " \"He has many old books, which he inherited from his ancestors.\"\n",
437
+ "]\n",
438
+ "translations = batch_translate(en_sents, \"eng_Latn\", \"hin_Deva\")\n",
439
+ "\n",
440
+ "for src, tgt in zip(en_sents, translations):\n",
441
+ " print(f\"{src} --> {tgt}\")\n"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": 3,
447
+ "id": "6226efc6",
448
+ "metadata": {},
449
+ "outputs": [
450
+ {
451
+ "data": {
452
+ "application/vnd.jupyter.widget-view+json": {
453
+ "model_id": "723d9adbe3d04fa0a614614fd9e12402",
454
+ "version_major": 2,
455
+ "version_minor": 0
456
+ },
457
+ "text/plain": [
458
+ "tokenizer_config.json: 0%| | 0.00/44.0 [00:00<?, ?B/s]"
459
+ ]
460
+ },
461
+ "metadata": {},
462
+ "output_type": "display_data"
463
+ },
464
+ {
465
+ "data": {
466
+ "application/vnd.jupyter.widget-view+json": {
467
+ "model_id": "899ca69572054095939b5d5bb30ef0fa",
468
+ "version_major": 2,
469
+ "version_minor": 0
470
+ },
471
+ "text/plain": [
472
+ "source.spm: 0%| | 0.00/812k [00:00<?, ?B/s]"
473
+ ]
474
+ },
475
+ "metadata": {},
476
+ "output_type": "display_data"
477
+ },
478
+ {
479
+ "data": {
480
+ "application/vnd.jupyter.widget-view+json": {
481
+ "model_id": "828492450e724b67bad0bc91d5627377",
482
+ "version_major": 2,
483
+ "version_minor": 0
484
+ },
485
+ "text/plain": [
486
+ "target.spm: 0%| | 0.00/1.07M [00:00<?, ?B/s]"
487
+ ]
488
+ },
489
+ "metadata": {},
490
+ "output_type": "display_data"
491
+ },
492
+ {
493
+ "data": {
494
+ "application/vnd.jupyter.widget-view+json": {
495
+ "model_id": "0c8a6aa4773a427d98c7daf6807c84cb",
496
+ "version_major": 2,
497
+ "version_minor": 0
498
+ },
499
+ "text/plain": [
500
+ "vocab.json: 0.00B [00:00, ?B/s]"
501
+ ]
502
+ },
503
+ "metadata": {},
504
+ "output_type": "display_data"
505
+ },
506
+ {
507
+ "data": {
508
+ "application/vnd.jupyter.widget-view+json": {
509
+ "model_id": "e9d7c0c795034dce94b2ab1846ed91bc",
510
+ "version_major": 2,
511
+ "version_minor": 0
512
+ },
513
+ "text/plain": [
514
+ "config.json: 0.00B [00:00, ?B/s]"
515
+ ]
516
+ },
517
+ "metadata": {},
518
+ "output_type": "display_data"
519
+ },
520
+ {
521
+ "data": {
522
+ "application/vnd.jupyter.widget-view+json": {
523
+ "model_id": "29eb15e6882b41afad2f0a6bb1dbfea3",
524
+ "version_major": 2,
525
+ "version_minor": 0
526
+ },
527
+ "text/plain": [
528
+ "pytorch_model.bin: 0%| | 0.00/306M [00:00<?, ?B/s]"
529
+ ]
530
+ },
531
+ "metadata": {},
532
+ "output_type": "display_data"
533
+ },
534
+ {
535
+ "ename": "OSError",
536
+ "evalue": "[Errno 28] No space left on device",
537
+ "output_type": "error",
538
+ "traceback": [
539
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
540
+ "\u001b[31mOSError\u001b[39m Traceback (most recent call last)",
541
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 4\u001b[39m model_name = \u001b[33m\"\u001b[39m\u001b[33mHelsinki-NLP/opus-mt-en-hi\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 5\u001b[39m tokenizer = MarianTokenizer.from_pretrained(model_name)\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m model = \u001b[43mMarianMTModel\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m)\u001b[49m.to(\u001b[33m\"\u001b[39m\u001b[33mcuda\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch.cuda.is_available() \u001b[38;5;28;01melse\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mcpu\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mtranslate\u001b[39m(texts):\n\u001b[32m 9\u001b[39m inputs = tokenizer(texts, return_tensors=\u001b[33m\"\u001b[39m\u001b[33mpt\u001b[39m\u001b[33m\"\u001b[39m, padding=\u001b[38;5;28;01mTrue\u001b[39;00m, truncation=\u001b[38;5;28;01mTrue\u001b[39;00m).to(model.device)\n",
542
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\modeling_utils.py:317\u001b[39m, in \u001b[36mrestore_default_torch_dtype.<locals>._wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 315\u001b[39m old_dtype = torch.get_default_dtype()\n\u001b[32m 316\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m317\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 318\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 319\u001b[39m torch.set_default_dtype(old_dtype)\n",
543
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\modeling_utils.py:4923\u001b[39m, in \u001b[36mPreTrainedModel.from_pretrained\u001b[39m\u001b[34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)\u001b[39m\n\u001b[32m 4913\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[32m 4914\u001b[39m gguf_file\n\u001b[32m 4915\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m device_map \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 4916\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m ((\u001b[38;5;28misinstance\u001b[39m(device_map, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mdisk\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m device_map.values()) \u001b[38;5;129;01mor\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mdisk\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m device_map)\n\u001b[32m 4917\u001b[39m ):\n\u001b[32m 4918\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[32m 4919\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mOne or more modules is configured to be mapped to disk. 
Disk offload is not supported for models \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 4920\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mloaded from GGUF files.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 4921\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m4923\u001b[39m checkpoint_files, sharded_metadata = \u001b[43m_get_resolved_checkpoint_files\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 4924\u001b[39m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4925\u001b[39m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4926\u001b[39m \u001b[43m \u001b[49m\u001b[43mvariant\u001b[49m\u001b[43m=\u001b[49m\u001b[43mvariant\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4927\u001b[39m \u001b[43m \u001b[49m\u001b[43mgguf_file\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgguf_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4928\u001b[39m \u001b[43m \u001b[49m\u001b[43mfrom_tf\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfrom_tf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4929\u001b[39m \u001b[43m \u001b[49m\u001b[43mfrom_flax\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfrom_flax\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4930\u001b[39m \u001b[43m \u001b[49m\u001b[43muse_safetensors\u001b[49m\u001b[43m=\u001b[49m\u001b[43muse_safetensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4931\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4932\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4933\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4934\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4935\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4936\u001b[39m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m=\u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4937\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4938\u001b[39m \u001b[43m \u001b[49m\u001b[43mcommit_hash\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4939\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_remote_code\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_auto_class\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 4940\u001b[39m \u001b[43m \u001b[49m\u001b[43mtransformers_explicit_filename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtransformers_explicit_filename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 4941\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4943\u001b[39m is_sharded = sharded_metadata \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 4944\u001b[39m is_quantized = hf_quantizer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n",
544
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\modeling_utils.py:1208\u001b[39m, in \u001b[36m_get_resolved_checkpoint_files\u001b[39m\u001b[34m(pretrained_model_name_or_path, subfolder, variant, gguf_file, from_tf, from_flax, use_safetensors, cache_dir, force_download, proxies, local_files_only, token, user_agent, revision, commit_hash, is_remote_code, transformers_explicit_filename)\u001b[39m\n\u001b[32m 1205\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1206\u001b[39m \u001b[38;5;66;03m# This repo has no safetensors file of any kind, we switch to PyTorch.\u001b[39;00m\n\u001b[32m 1207\u001b[39m filename = _add_variant(WEIGHTS_NAME, variant)\n\u001b[32m-> \u001b[39m\u001b[32m1208\u001b[39m resolved_archive_file = \u001b[43mcached_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1209\u001b[39m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mcached_file_kwargs\u001b[49m\n\u001b[32m 1210\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1211\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m resolved_archive_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m filename == _add_variant(WEIGHTS_NAME, variant):\n\u001b[32m 1212\u001b[39m \u001b[38;5;66;03m# Maybe the checkpoint is sharded, we try to grab the index name in this case.\u001b[39;00m\n\u001b[32m 1213\u001b[39m resolved_archive_file = cached_file(\n\u001b[32m 1214\u001b[39m pretrained_model_name_or_path,\n\u001b[32m 1215\u001b[39m _add_variant(WEIGHTS_INDEX_NAME, variant),\n\u001b[32m 1216\u001b[39m **cached_file_kwargs,\n\u001b[32m 1217\u001b[39m )\n",
545
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\utils\\hub.py:321\u001b[39m, in \u001b[36mcached_file\u001b[39m\u001b[34m(path_or_repo_id, filename, **kwargs)\u001b[39m\n\u001b[32m 263\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcached_file\u001b[39m(\n\u001b[32m 264\u001b[39m path_or_repo_id: Union[\u001b[38;5;28mstr\u001b[39m, os.PathLike],\n\u001b[32m 265\u001b[39m filename: \u001b[38;5;28mstr\u001b[39m,\n\u001b[32m 266\u001b[39m **kwargs,\n\u001b[32m 267\u001b[39m ) -> Optional[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[32m 268\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 269\u001b[39m \u001b[33;03m Tries to locate a file in a local folder and repo, downloads and cache it if necessary.\u001b[39;00m\n\u001b[32m 270\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 319\u001b[39m \u001b[33;03m ```\u001b[39;00m\n\u001b[32m 320\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m321\u001b[39m file = \u001b[43mcached_files\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilenames\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 322\u001b[39m file = file[\u001b[32m0\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m file\n\u001b[32m 323\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m file\n",
546
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\utils\\hub.py:567\u001b[39m, in \u001b[36mcached_files\u001b[39m\u001b[34m(path_or_repo_id, filenames, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[39m\n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# Any other Exception type should now be re-raised, in order to provide helpful error messages and break the execution flow\u001b[39;00m\n\u001b[32m 565\u001b[39m \u001b[38;5;66;03m# (EntryNotFoundError will be treated outside this block and correctly re-raised if needed)\u001b[39;00m\n\u001b[32m 566\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e, EntryNotFoundError):\n\u001b[32m--> \u001b[39m\u001b[32m567\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m 569\u001b[39m resolved_files = [\n\u001b[32m 570\u001b[39m _get_cache_file_to_return(path_or_repo_id, filename, cache_dir, revision) \u001b[38;5;28;01mfor\u001b[39;00m filename \u001b[38;5;129;01min\u001b[39;00m full_filenames\n\u001b[32m 571\u001b[39m ]\n\u001b[32m 572\u001b[39m \u001b[38;5;66;03m# If there are any missing file and the flag is active, raise\u001b[39;00m\n",
547
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\transformers\\utils\\hub.py:479\u001b[39m, in \u001b[36mcached_files\u001b[39m\u001b[34m(path_or_repo_id, filenames, cache_dir, force_download, resume_download, proxies, token, revision, local_files_only, subfolder, repo_type, user_agent, _raise_exceptions_for_gated_repo, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash, **deprecated_kwargs)\u001b[39m\n\u001b[32m 476\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 477\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(full_filenames) == \u001b[32m1\u001b[39m:\n\u001b[32m 478\u001b[39m \u001b[38;5;66;03m# This is slightly better for only 1 file\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m479\u001b[39m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 480\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath_or_repo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 481\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilenames\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 482\u001b[39m \u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[43m==\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msubfolder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 483\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 484\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 485\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 486\u001b[39m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m=\u001b[49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 487\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 488\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 489\u001b[39m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 490\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 491\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 492\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 493\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 494\u001b[39m snapshot_download(\n\u001b[32m 495\u001b[39m path_or_repo_id,\n\u001b[32m 496\u001b[39m allow_patterns=full_filenames,\n\u001b[32m (...)\u001b[39m\u001b[32m 505\u001b[39m local_files_only=local_files_only,\n\u001b[32m 506\u001b[39m )\n",
548
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:114\u001b[39m, in \u001b[36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 111\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[32m 112\u001b[39m kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.\u001b[34m__name__\u001b[39m, has_token=has_token, kwargs=kwargs)\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
549
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:1010\u001b[39m, in \u001b[36mhf_hub_download\u001b[39m\u001b[34m(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, user_agent, force_download, proxies, etag_timeout, token, local_files_only, headers, endpoint, resume_download, force_filename, local_dir_use_symlinks)\u001b[39m\n\u001b[32m 990\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _hf_hub_download_to_local_dir(\n\u001b[32m 991\u001b[39m \u001b[38;5;66;03m# Destination\u001b[39;00m\n\u001b[32m 992\u001b[39m local_dir=local_dir,\n\u001b[32m (...)\u001b[39m\u001b[32m 1007\u001b[39m local_files_only=local_files_only,\n\u001b[32m 1008\u001b[39m )\n\u001b[32m 1009\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1010\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_hf_hub_download_to_cache_dir\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1011\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Destination\u001b[39;49;00m\n\u001b[32m 1012\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1013\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# File info\u001b[39;49;00m\n\u001b[32m 1014\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1015\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1016\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1017\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1018\u001b[39m \u001b[43m 
\u001b[49m\u001b[38;5;66;43;03m# HTTP info\u001b[39;49;00m\n\u001b[32m 1019\u001b[39m \u001b[43m \u001b[49m\u001b[43mendpoint\u001b[49m\u001b[43m=\u001b[49m\u001b[43mendpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1020\u001b[39m \u001b[43m \u001b[49m\u001b[43metag_timeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43metag_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1021\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhf_headers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1022\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1023\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1024\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Additional options\u001b[39;49;00m\n\u001b[32m 1025\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1026\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1027\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
550
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:1171\u001b[39m, in \u001b[36m_hf_hub_download_to_cache_dir\u001b[39m\u001b[34m(cache_dir, repo_id, filename, repo_type, revision, endpoint, etag_timeout, headers, proxies, token, local_files_only, force_download)\u001b[39m\n\u001b[32m 1168\u001b[39m \u001b[38;5;66;03m# Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)\u001b[39;00m\n\u001b[32m 1170\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m WeakFileLock(lock_path):\n\u001b[32m-> \u001b[39m\u001b[32m1171\u001b[39m \u001b[43m_download_to_tmp_and_move\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1172\u001b[39m \u001b[43m \u001b[49m\u001b[43mincomplete_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblob_path\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m.incomplete\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1173\u001b[39m \u001b[43m \u001b[49m\u001b[43mdestination_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblob_path\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1174\u001b[39m \u001b[43m \u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1175\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1176\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1177\u001b[39m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1178\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1179\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1180\u001b[39m \u001b[43m \u001b[49m\u001b[43metag\u001b[49m\u001b[43m=\u001b[49m\u001b[43metag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1181\u001b[39m \u001b[43m \u001b[49m\u001b[43mxet_file_data\u001b[49m\u001b[43m=\u001b[49m\u001b[43mxet_file_data\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1182\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1183\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m os.path.exists(pointer_path):\n\u001b[32m 1184\u001b[39m _create_symlink(blob_path, pointer_path, new_blob=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
551
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:1738\u001b[39m, in \u001b[36m_download_to_tmp_and_move\u001b[39m\u001b[34m(incomplete_path, destination_path, url_to_download, proxies, headers, expected_size, filename, force_download, etag, xet_file_data)\u001b[39m\n\u001b[32m 1731\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m xet_file_data \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m constants.HF_HUB_DISABLE_XET:\n\u001b[32m 1732\u001b[39m logger.warning(\n\u001b[32m 1733\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mXet Storage is enabled for this repo, but the \u001b[39m\u001b[33m'\u001b[39m\u001b[33mhf_xet\u001b[39m\u001b[33m'\u001b[39m\u001b[33m package is not installed. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1734\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mFalling back to regular HTTP download. 
\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1735\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mFor better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1736\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1738\u001b[39m \u001b[43mhttp_get\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1739\u001b[39m \u001b[43m \u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1740\u001b[39m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1741\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1742\u001b[39m \u001b[43m \u001b[49m\u001b[43mresume_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresume_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1743\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1744\u001b[39m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1745\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1747\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDownload complete. Moving file to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdestination_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 1748\u001b[39m _chmod_and_move(incomplete_path, destination_path)\n",
552
+ "\u001b[36mFile \u001b[39m\u001b[32md:\\Major Project\\Chathur\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:499\u001b[39m, in \u001b[36mhttp_get\u001b[39m\u001b[34m(url, temp_file, proxies, resume_size, headers, expected_size, displayed_filename, _nb_retries, _tqdm_bar)\u001b[39m\n\u001b[32m 497\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m chunk: \u001b[38;5;66;03m# filter out keep-alive new chunks\u001b[39;00m\n\u001b[32m 498\u001b[39m progress.update(\u001b[38;5;28mlen\u001b[39m(chunk))\n\u001b[32m--> \u001b[39m\u001b[32m499\u001b[39m \u001b[43mtemp_file\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 500\u001b[39m new_resume_size += \u001b[38;5;28mlen\u001b[39m(chunk)\n\u001b[32m 501\u001b[39m \u001b[38;5;66;03m# Some data has been downloaded from the server so we reset the number of retries.\u001b[39;00m\n",
553
+ "\u001b[31mOSError\u001b[39m: [Errno 28] No space left on device"
554
+ ]
555
+ }
556
+ ],
557
+ "source": [
558
+ "from transformers import MarianMTModel, MarianTokenizer\n",
559
+ "import torch\n",
560
+ "\n",
561
+ "model_name = \"Helsinki-NLP/opus-mt-en-hi\"\n",
562
+ "tokenizer = MarianTokenizer.from_pretrained(model_name)\n",
563
+ "model = MarianMTModel.from_pretrained(model_name).to(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
564
+ "\n",
565
+ "def translate(texts):\n",
566
+ " inputs = tokenizer(texts, return_tensors=\"pt\", padding=True, truncation=True).to(model.device)\n",
567
+ " translated = model.generate(**inputs, max_length=256)\n",
568
+ " return tokenizer.batch_decode(translated, skip_special_tokens=True)\n",
569
+ "\n",
570
+ "sentences = [\n",
571
+ " \"I love Indian food.\",\n",
572
+ " \"My friend is visiting Delhi tomorrow.\",\n",
573
+ " \"The weather is very pleasant today.\"\n",
574
+ "]\n",
575
+ "print(translate(sentences))\n"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 1,
581
+ "id": "0166a4f1",
582
+ "metadata": {},
583
+ "outputs": [
584
+ {
585
+ "name": "stdout",
586
+ "output_type": "stream",
587
+ "text": [
588
+ "Files removed: 0 (0 bytes)\n",
589
+ "Note: you may need to restart the kernel to use updated packages.\n"
590
+ ]
591
+ },
592
+ {
593
+ "name": "stderr",
594
+ "output_type": "stream",
595
+ "text": [
596
+ "WARNING: No matching packages\n"
597
+ ]
598
+ }
599
+ ],
600
+ "source": [
601
+ "%pip cache purge\n"
602
+ ]
603
+ },
604
+ {
605
+ "cell_type": "code",
606
+ "execution_count": null,
607
+ "id": "d9218bcd",
608
+ "metadata": {},
609
+ "outputs": [],
610
+ "source": []
611
+ }
612
+ ],
613
+ "metadata": {
614
+ "kernelspec": {
615
+ "display_name": ".venv",
616
+ "language": "python",
617
+ "name": "python3"
618
+ },
619
+ "language_info": {
620
+ "codemirror_mode": {
621
+ "name": "ipython",
622
+ "version": 3
623
+ },
624
+ "file_extension": ".py",
625
+ "mimetype": "text/x-python",
626
+ "name": "python",
627
+ "nbconvert_exporter": "python",
628
+ "pygments_lexer": "ipython3",
629
+ "version": "3.13.7"
630
+ }
631
+ },
632
+ "nbformat": 4,
633
+ "nbformat_minor": 5
634
+ }
api/rag/translator.ipynb ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "1243db1a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "!pip install --upgrade torch --index-url https://download.pytorch.org/whl/cu121 --quiet\n",
11
+ "!pip install --upgrade transformers datasets sentencepiece sacrebleu evaluate --quiet\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "markdown",
16
+ "id": "03d54e16",
17
+ "metadata": {},
18
+ "source": [
19
+ "### Install dependencies and upgrade PyTorch"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "id": "b1593c50",
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "import os\n",
30
+ "os.environ[\"WANDB_DISABLED\"] = \"true\" # Disable WandB logging completely\n",
31
+ "\n",
32
+ "import torch\n",
33
+ "from datasets import load_dataset, concatenate_datasets\n",
34
+ "import evaluate\n",
35
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "markdown",
40
+ "id": "ec6c67e5",
41
+ "metadata": {},
42
+ "source": [
43
+ "### 1. Set device for GPU if available"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "id": "3aeb9062",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
54
+ "print(\"Using device:\", device)"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "id": "8fe4ab2b",
60
+ "metadata": {},
61
+ "source": [
62
+ "### 2. Load English–Kannada and English–Hindi datasets"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "id": "4a2d41a9",
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "print(\"Loading datasets...\")\n",
73
+ "dataset_kn = load_dataset(\"ai4bharat/samanantar\", \"kn\", split=\"train\")\n",
74
+ "dataset_hi = load_dataset(\"ai4bharat/samanantar\", \"hi\", split=\"train\")\n",
75
+ "\n",
76
+ "print(\"English–Kannada sample:\", dataset_kn[0])\n",
77
+ "print(\"English–Hindi sample:\", dataset_hi[0])"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "id": "dac2a390",
83
+ "metadata": {},
84
+ "source": [
85
+ "### 3. Reduce dataset size for faster local training"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "id": "43ab2de9",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "max_samples = 50000 # Adjust this number based on your system performance\n",
96
+ "if len(dataset_kn) > max_samples:\n",
97
+ " dataset_kn = dataset_kn.select(range(max_samples))\n",
98
+ "if len(dataset_hi) > max_samples:\n",
99
+ " dataset_hi = dataset_hi.select(range(max_samples))"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "markdown",
104
+ "id": "4c1e651d",
105
+ "metadata": {},
106
+ "source": [
107
+ "### 4. Merge datasets"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "2ac937ec",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "dataset = concatenate_datasets([dataset_kn, dataset_hi])\n",
118
+ "print(\"Combined dataset size:\", len(dataset))"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "id": "8c3252bf",
124
+ "metadata": {},
125
+ "source": [
126
+ "### 5. Load tokenizer and model (mT5-small with safetensors)"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "id": "ebd71460",
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "\n",
137
+ "model_checkpoint = \"google/mt5-small\"\n",
138
+ "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
139
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, trust_remote_code=True, use_safetensors=True)\n",
140
+ "model.to(device) # Move model to GPU if available\n",
141
+ "\n",
142
+ "# ================================================\n",
143
+ "# 6. Preprocess data\n",
144
+ "# ================================================\n",
145
+ "max_len = 128\n",
146
+ "\n",
147
+ "def preprocess_function(examples):\n",
148
+ " inputs = examples[\"src\"] # English text\n",
149
+ " targets = examples[\"tgt\"] # Kannada or Hindi text\n",
150
+ " model_inputs = tokenizer(inputs, truncation=True, padding=\"max_length\", max_length=max_len)\n",
151
+ " with tokenizer.as_target_tokenizer():\n",
152
+ " labels = tokenizer(targets, truncation=True, padding=\"max_length\", max_length=max_len)\n",
153
+ " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
154
+ " return model_inputs\n",
155
+ "\n",
156
+ "print(\"Tokenizing dataset...\")\n",
157
+ "tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"idx\", \"src\", \"tgt\"])\n",
158
+ "\n",
159
+ "# ================================================\n",
160
+ "# 7. Training setup\n",
161
+ "# ================================================\n",
162
+ "metric = evaluate.load(\"sacrebleu\")\n",
163
+ "\n",
164
+ "def compute_metrics(eval_pred):\n",
165
+ " preds, labels = eval_pred\n",
166
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
167
+ " labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]\n",
168
+ " result = metric.compute(predictions=decoded_preds, references=labels)\n",
169
+ " result[\"bleu\"] = result[\"score\"]\n",
170
+ " return result\n",
171
+ "\n",
172
+ "training_args = Seq2SeqTrainingArguments(\n",
173
+ " output_dir=\"./translator-model\",\n",
174
+ " do_eval=True,\n",
175
+ " per_device_train_batch_size=8,\n",
176
+ " per_device_eval_batch_size=8,\n",
177
+ " learning_rate=5e-5,\n",
178
+ " num_train_epochs=2,\n",
179
+ " weight_decay=0.01,\n",
180
+ " save_total_limit=2,\n",
181
+ " predict_with_generate=True,\n",
182
+ " logging_steps=200,\n",
183
+ " save_steps=1000,\n",
184
+ " report_to=\"none\" # no W&B logging\n",
185
+ ")\n",
186
+ "\n",
187
+ "trainer = Seq2SeqTrainer(\n",
188
+ " model=model,\n",
189
+ " args=training_args,\n",
190
+ " train_dataset=tokenized_dataset,\n",
191
+ " eval_dataset=tokenized_dataset.select(range(min(1000, len(tokenized_dataset)))), # small eval set\n",
192
+ " tokenizer=tokenizer,\n",
193
+ " compute_metrics=compute_metrics,\n",
194
+ ")"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "markdown",
199
+ "id": "e7d85b60",
200
+ "metadata": {},
201
+ "source": [
202
+ "### 8. Train model"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": null,
208
+ "id": "d031c154",
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "print(\"Starting training...\")\n",
213
+ "trainer.train()"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "markdown",
218
+ "id": "4679d55b",
219
+ "metadata": {},
220
+ "source": [
221
+ "### 9. Evaluate model"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": null,
227
+ "id": "9d484587",
228
+ "metadata": {},
229
+ "outputs": [],
230
+ "source": [
231
+ "print(\"Running evaluation...\")\n",
232
+ "results = trainer.evaluate()\n",
233
+ "print(\"Evaluation BLEU score:\", results.get(\"bleu\", results))\n"
234
+ ]
235
+ },
236
+ {
237
+ "cell_type": "markdown",
238
+ "id": "c39f5ed2",
239
+ "metadata": {},
240
+ "source": [
241
+ "### 10. Save final model locally"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": null,
247
+ "id": "8e5f4517",
248
+ "metadata": {},
249
+ "outputs": [],
250
+ "source": [
251
+ "trainer.save_model(\"./final-translator\")\n",
252
+ "tokenizer.save_pretrained(\"./final-translator\")\n",
253
+ "print(\"Model saved in ./final-translator\")"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "markdown",
258
+ "id": "7d019483",
259
+ "metadata": {},
260
+ "source": [
261
+ "### 11. Test translation"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "markdown",
266
+ "id": "67c28c80",
267
+ "metadata": {},
268
+ "source": [
269
+ "test_sentence = \"How are you?\"\n",
270
+ "inputs = tokenizer(test_sentence, return_tensors=\"pt\", padding=True).to(device)\n",
271
+ "outputs = model.generate(**inputs, max_length=50)\n",
272
+ "print(\"Input:\", test_sentence)\n",
273
+ "print(\"Translated:\", tokenizer.decode(outputs[0], skip_special_tokens=True))"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 1,
279
+ "id": "d5daee8c",
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "ename": "ModuleNotFoundError",
284
+ "evalue": "No module named 'torch'",
285
+ "output_type": "error",
286
+ "traceback": [
287
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
288
+ "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
289
+ "Cell \u001b[1;32mIn[1], line 11\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# ==========================================================\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# 0. Install dependencies (run in terminal once)\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# ==========================================================\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;66;03m# 1. Imports and setup\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;66;03m# ==========================================================\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m---> 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mdatasets\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m load_dataset, concatenate_datasets\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mevaluate\u001b[39;00m\n",
290
+ "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torch'"
291
+ ]
292
+ }
293
+ ],
294
+ "source": [
295
+ "# ==========================================================\n",
296
+ "# 0. Install dependencies (run in terminal once)\n",
297
+ "# ==========================================================\n",
298
+ "# pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1+cu121 --index-url https://download.pytorch.org/whl/cu121\n",
299
+ "# pip install --upgrade transformers datasets sentencepiece sacrebleu evaluate peft --quiet\n",
300
+ "\n",
301
+ "# ==========================================================\n",
302
+ "# 1. Imports and setup\n",
303
+ "# ==========================================================\n",
304
+ "import os\n",
305
+ "import torch\n",
306
+ "from datasets import load_dataset, concatenate_datasets\n",
307
+ "import evaluate\n",
308
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer\n",
309
+ "\n",
310
+ "# PEFT for LoRA\n",
311
+ "from peft import LoraConfig, get_peft_model\n",
312
+ "\n",
313
+ "# Disable wandb\n",
314
+ "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
315
+ "\n",
316
+ "# Use GPU if available\n",
317
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
318
+ "print(\"Using device:\", device)\n",
319
+ "\n",
320
+ "# ==========================================================\n",
321
+ "# 2. Load datasets\n",
322
+ "# ==========================================================\n",
323
+ "print(\"Loading datasets...\")\n",
324
+ "dataset_kn = load_dataset(\"ai4bharat/samanantar\", \"kn\", split=\"train\")\n",
325
+ "dataset_hi = load_dataset(\"ai4bharat/samanantar\", \"hi\", split=\"train\")\n",
326
+ "\n",
327
+ "# Optional: reduce dataset size for quick local training\n",
328
+ "max_samples = 5000 # adjust depending on your GPU memory\n",
329
+ "dataset_kn = dataset_kn.select(range(min(len(dataset_kn), max_samples)))\n",
330
+ "dataset_hi = dataset_hi.select(range(min(len(dataset_hi), max_samples)))\n",
331
+ "\n",
332
+ "# Merge datasets\n",
333
+ "dataset = concatenate_datasets([dataset_kn, dataset_hi])\n",
334
+ "print(\"Combined dataset size:\", len(dataset))\n",
335
+ "\n",
336
+ "# ==========================================================\n",
337
+ "# 3. Load tokenizer and model (mt5-small for low VRAM)\n",
339
+ "# ==========================================================\n",
340
+ "model_checkpoint = \"google/mt5-small\"\n",
340
+ "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
341
+ "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, use_safetensors=True)\n",
342
+ "model.to(device)\n",
343
+ "\n",
344
+ "# ==========================================================\n",
345
+ "# 4. LoRA config (parameter-efficient fine-tuning)\n",
346
+ "# ==========================================================\n",
347
+ "lora_config = LoraConfig(\n",
348
+ " r=8,\n",
349
+ " lora_alpha=16,\n",
350
+ " target_modules=[\"q\", \"v\"], # applies LoRA to Q and V matrices\n",
351
+ " lora_dropout=0.05,\n",
352
+ " bias=\"none\",\n",
353
+ " task_type=\"SEQ_2_SEQ_LM\"\n",
354
+ ")\n",
355
+ "model = get_peft_model(model, lora_config)\n",
356
+ "print(\"LoRA applied for low-memory fine-tuning.\")\n",
357
+ "\n",
358
+ "# ==========================================================\n",
359
+ "# 5. Preprocessing\n",
360
+ "# ==========================================================\n",
361
+ "max_len = 128\n",
362
+ "\n",
363
+ "def preprocess_function(examples):\n",
364
+ " inputs = examples[\"src\"] # English\n",
365
+ " targets = examples[\"tgt\"] # Kannada or Hindi\n",
366
+ " model_inputs = tokenizer(inputs, truncation=True, padding=\"max_length\", max_length=max_len)\n",
367
+ " with tokenizer.as_target_tokenizer():\n",
368
+ " labels = tokenizer(targets, truncation=True, padding=\"max_length\", max_length=max_len)\n",
369
+ " model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
370
+ " return model_inputs\n",
371
+ "\n",
372
+ "print(\"Tokenizing dataset...\")\n",
373
+ "tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=[\"idx\", \"src\", \"tgt\"])\n",
374
+ "\n",
375
+ "# ==========================================================\n",
376
+ "# 6. Evaluation metric\n",
377
+ "# ==========================================================\n",
378
+ "metric = evaluate.load(\"sacrebleu\")\n",
379
+ "\n",
380
+ "def compute_metrics(eval_pred):\n",
381
+ " preds, labels = eval_pred\n",
382
+ " decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
383
+ " labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]\n",
384
+ " result = metric.compute(predictions=decoded_preds, references=labels)\n",
385
+ " result[\"bleu\"] = result[\"score\"]\n",
386
+ " return result\n",
387
+ "\n",
388
+ "# ==========================================================\n",
389
+ "# 7. Training setup\n",
390
+ "# ==========================================================\n",
391
+ "training_args = Seq2SeqTrainingArguments(\n",
392
+ " output_dir=\"./translator-model\",\n",
393
+ " do_train=True,\n",
394
+ " do_eval=True,\n",
395
+ " per_device_train_batch_size=4, # reduce if out-of-memory\n",
396
+ " per_device_eval_batch_size=4,\n",
397
+ " learning_rate=5e-5,\n",
398
+ " num_train_epochs=2,\n",
399
+ " weight_decay=0.01,\n",
400
+ " save_total_limit=2,\n",
401
+ " predict_with_generate=True,\n",
402
+ " logging_steps=50,\n",
403
+ " save_steps=500,\n",
404
+ " report_to=\"none\"\n",
405
+ ")\n",
406
+ "\n",
407
+ "trainer = Seq2SeqTrainer(\n",
408
+ " model=model,\n",
409
+ " args=training_args,\n",
410
+ " train_dataset=tokenized_dataset,\n",
411
+ " eval_dataset=tokenized_dataset.select(range(min(500, len(tokenized_dataset)))), # small eval\n",
412
+ " tokenizer=tokenizer,\n",
413
+ " compute_metrics=compute_metrics,\n",
414
+ ")\n",
415
+ "\n",
416
+ "\n",
417
+ "\n",
418
+ "# ==========================================================\n",
419
+ "# 11. Test translation\n",
420
+ "# ==========================================================\n",
421
+ "test_sentence = \"How are you?\"\n",
422
+ "inputs = tokenizer(test_sentence, return_tensors=\"pt\", padding=True).to(device)\n",
423
+ "outputs = model.generate(**inputs, max_length=50)\n",
424
+ "print(\"Input:\", test_sentence)\n",
425
+ "print(\"Translated:\", tokenizer.decode(outputs[0], skip_special_tokens=True))\n"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": null,
431
+ "id": "50378dce",
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "# ==========================================================\n",
436
+ "# 8. Train the model\n",
437
+ "# ==========================================================\n",
438
+ "print(\"Starting training...\")\n",
439
+ "trainer.train()\n",
440
+ "\n",
441
+ "# ==========================================================\n",
442
+ "# 9. Evaluate\n",
443
+ "# ==========================================================\n",
444
+ "print(\"Running evaluation...\")\n",
445
+ "results = trainer.evaluate()\n",
446
+ "print(\"Evaluation BLEU score:\", results.get(\"bleu\", results))\n",
447
+ "\n"
448
+ ]
449
+ },
450
+ {
451
+ "cell_type": "code",
452
+ "execution_count": null,
453
+ "id": "2b11c06e",
454
+ "metadata": {},
455
+ "outputs": [],
456
+ "source": [
457
+ "# ==========================================================\n",
458
+ "# 10. Save final model\n",
459
+ "# ==========================================================\n",
460
+ "trainer.save_model(\"./final-translator\")\n",
461
+ "tokenizer.save_pretrained(\"./final-translator\")\n",
462
+ "print(\"Model saved in ./final-translator\")"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": 2,
468
+ "id": "5112bac7",
469
+ "metadata": {},
470
+ "outputs": [
471
+ {
472
+ "name": "stdout",
473
+ "output_type": "stream",
474
+ "text": [
475
+ "Looking in indexes: https://download.pytorch.org/whl/cu121\n",
476
+ "Collecting torch==2.5.1+cu121\n",
477
+ " Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp39-cp39-win_amd64.whl (2449.3 MB)\n",
478
+ "Note: you may need to restart the kernel to use updated packages.\n"
479
+ ]
480
+ },
481
+ {
482
+ "name": "stderr",
483
+ "output_type": "stream",
484
+ "text": [
485
+ "ERROR: Could not find a version that satisfies the requirement torchvision==0.12.1+cu121 (from versions: 0.1.6, 0.2.0, 0.16.0+cu121, 0.16.1+cu121, 0.16.2+cu121, 0.17.0+cu121, 0.17.1+cu121, 0.17.2+cu121, 0.18.0+cu121, 0.18.1+cu121, 0.19.0+cu121, 0.19.1+cu121, 0.20.0+cu121, 0.20.1+cu121)\n",
486
+ "ERROR: No matching distribution found for torchvision==0.12.1+cu121\n"
487
+ ]
488
+ }
489
+ ],
490
+ "source": [
491
+ "%pip install torch==2.5.1+cu121 torchvision==0.20.1+cu121 torchaudio==2.5.1+cu121 --index-url https://download.pytorch.org/whl/cu121\n"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "execution_count": null,
497
+ "id": "37e4d148",
498
+ "metadata": {},
499
+ "outputs": [],
500
+ "source": []
501
+ }
502
+ ],
503
+ "metadata": {
504
+ "kernelspec": {
505
+ "display_name": "chathur",
506
+ "language": "python",
507
+ "name": "python3"
508
+ },
509
+ "language_info": {
510
+ "codemirror_mode": {
511
+ "name": "ipython",
512
+ "version": 3
513
+ },
514
+ "file_extension": ".py",
515
+ "mimetype": "text/x-python",
516
+ "name": "python",
517
+ "nbconvert_exporter": "python",
518
+ "pygments_lexer": "ipython3",
519
+ "version": "3.9.13"
520
+ }
521
+ },
522
+ "nbformat": 4,
523
+ "nbformat_minor": 5
524
+ }
api/routes/endpoints.py CHANGED
@@ -1,12 +1,21 @@
1
- from fastapi import APIRouter, HTTPException, Query, status
2
  import urllib.parse
3
- from api.services.scheme_service import get_all_schemes, get_schemes_by_state, get_scheme_details_by_title, search_schemes_in_cache, get_cache_loading_status # Changed import paths
4
- # from vector_ops import search_scheme # Assuming vector_ops is still a top-level file or imported correctly
 
 
 
 
 
5
 
6
  router = APIRouter()
7
 
8
- @router.get("/schemes", summary="Get all schemes grouped by state")
9
- def get_all_schemes_grouped_by_state_endpoint():
 
 
 
 
10
  """
11
  Returns all schemes grouped by state from the in-memory cache.
12
  """
@@ -16,14 +25,17 @@ def get_all_schemes_grouped_by_state_endpoint():
16
  detail="Schemes cache is currently loading. Please try again shortly."
17
  )
18
 
19
- schemes = get_all_schemes()
20
  if not schemes:
21
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="No schemes found in cache.")
22
  return schemes
23
 
24
 
25
- @router.get("/schemes/{state}", summary="Get schemes for a specific state")
26
- def get_scheme_titles_by_state_endpoint(state: str):
 
 
 
27
  """
28
  Returns all schemes for a specific state from the in-memory cache.
29
  """
@@ -33,7 +45,7 @@ def get_scheme_titles_by_state_endpoint(state: str):
33
  detail="Schemes cache is currently loading. Please try again shortly."
34
  )
35
 
36
- schemes_for_state = get_schemes_by_state(state)
37
  if not schemes_for_state:
38
  raise HTTPException(
39
  status_code=status.HTTP_404_NOT_FOUND,
@@ -46,8 +58,12 @@ def get_scheme_titles_by_state_endpoint(state: str):
46
  }
47
 
48
 
49
- @router.get("/schemes/{state}/scheme_titles/{title}", summary="Get details for a single scheme by title")
50
- def get_scheme_details_endpoint(state: str, title: str):
 
 
 
 
51
  """
52
  Returns details for a single scheme by title within a specific state from the in-memory cache.
53
  """
@@ -58,7 +74,7 @@ def get_scheme_details_endpoint(state: str, title: str):
58
  )
59
 
60
  decoded_title = urllib.parse.unquote(title)
61
- scheme_details = get_scheme_details_by_title(state, decoded_title)
62
 
63
  if not scheme_details:
64
  raise HTTPException(
@@ -68,8 +84,11 @@ def get_scheme_details_endpoint(state: str, title: str):
68
  return scheme_details
69
 
70
 
71
- @router.get("/searchscheme", summary="Search schemes by keyword across all states")
72
- def search_schemes_endpoint(query: str = Query(..., description="Search across all schemes")):
 
 
 
73
  """
74
  Searches schemes across all states using the in-memory cache for smooth performance.
75
  """
@@ -79,22 +98,14 @@ def search_schemes_endpoint(query: str = Query(..., description="Search across a
79
  detail="Schemes cache is currently loading. Please try again shortly."
80
  )
81
 
82
- matched_schemes = search_schemes_in_cache(query)
83
 
84
  if not matched_schemes:
85
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No schemes found matching '{query}'")
86
 
87
  return {
 
88
  "query": query,
89
  "matched_count": len(matched_schemes),
90
  "results": matched_schemes
91
  }
92
-
93
-
94
- # @router.get("/semantic-search", summary="Perform semantic search on schemes")
95
- # def semantic_search_endpoint(query: str = Query(...)):
96
- # """
97
- # Performs a semantic search on schemes using an external vector_ops module.
98
- # """
99
- # results = search_scheme(query)
100
- # return {"query": query, "results": results}
 
1
+ from fastapi import APIRouter, HTTPException, Query, Path, status
2
  import urllib.parse
3
+ from api.services.scheme_service import (
4
+ get_all_schemes,
5
+ get_schemes_by_state,
6
+ get_scheme_details_by_title,
7
+ search_schemes_in_cache,
8
+ get_cache_loading_status
9
+ )
10
 
11
  router = APIRouter()
12
 
13
+ # -------------------------
14
+ # Schemes endpoints with language
15
+ # -------------------------
16
+
17
+ @router.get("/{lang}/schemes", summary="Get all schemes grouped by state")
18
+ def get_all_schemes_grouped_by_state_endpoint(lang: str = Path(..., description="Language code, e.g., en, hi")):
19
  """
20
  Returns all schemes grouped by state from the in-memory cache.
21
  """
 
25
  detail="Schemes cache is currently loading. Please try again shortly."
26
  )
27
 
28
+ schemes = get_all_schemes(lang=lang)
29
  if not schemes:
30
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="No schemes found in cache.")
31
  return schemes
32
 
33
 
34
+ @router.get("/{lang}/schemes/{state}", summary="Get schemes for a specific state")
35
+ def get_scheme_titles_by_state_endpoint(
36
+ lang: str = Path(..., description="Language code, e.g., en, hi"),
37
+ state: str = Path(..., description="State name")
38
+ ):
39
  """
40
  Returns all schemes for a specific state from the in-memory cache.
41
  """
 
45
  detail="Schemes cache is currently loading. Please try again shortly."
46
  )
47
 
48
+ schemes_for_state = get_schemes_by_state(state, lang=lang)
49
  if not schemes_for_state:
50
  raise HTTPException(
51
  status_code=status.HTTP_404_NOT_FOUND,
 
58
  }
59
 
60
 
61
+ @router.get("/{lang}/schemes/{state}/scheme_titles/{title}", summary="Get details for a single scheme by title")
62
+ def get_scheme_details_endpoint(
63
+ lang: str = Path(..., description="Language code, e.g., en, hi"),
64
+ state: str = Path(..., description="State name"),
65
+ title: str = Path(..., description="Scheme title")
66
+ ):
67
  """
68
  Returns details for a single scheme by title within a specific state from the in-memory cache.
69
  """
 
74
  )
75
 
76
  decoded_title = urllib.parse.unquote(title)
77
+ scheme_details = get_scheme_details_by_title(state, decoded_title, lang=lang)
78
 
79
  if not scheme_details:
80
  raise HTTPException(
 
84
  return scheme_details
85
 
86
 
87
+ @router.get("/{lang}/searchscheme", summary="Search schemes by keyword across all states")
88
+ def search_schemes_endpoint(
89
+ lang: str = Path(..., description="Language code, e.g., en, hi"),
90
+ query: str = Query(..., description="Search across all schemes")
91
+ ):
92
  """
93
  Searches schemes across all states using the in-memory cache for smooth performance.
94
  """
 
98
  detail="Schemes cache is currently loading. Please try again shortly."
99
  )
100
 
101
+ matched_schemes = search_schemes_in_cache(query, lang=lang)
102
 
103
  if not matched_schemes:
104
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"No schemes found matching '{query}'")
105
 
106
  return {
107
+ "lang": lang,
108
  "query": query,
109
  "matched_count": len(matched_schemes),
110
  "results": matched_schemes
111
  }
 
 
 
 
 
 
 
 
 
api/services/scheme_service.py CHANGED
@@ -1,6 +1,7 @@
1
  import asyncio
2
  import logging
3
- from api.core.firebase_utils import get_firestore_db # Changed import path
 
4
 
5
  logger = logging.getLogger(__name__)
6
 
@@ -21,7 +22,7 @@ async def load_all_schemes_into_cache():
21
  is_cache_loading = True
22
  logger.info("Starting to load all schemes into cache from Firestore...")
23
  temp_schemes_cache = {}
24
- db = get_firestore_db() # Get the initialized DB client
25
 
26
  if not db:
27
  logger.error("Firestore DB client is not available. Cannot load schemes into cache.")
@@ -29,22 +30,22 @@ async def load_all_schemes_into_cache():
29
  return
30
 
31
  try:
32
- state_docs = db.collection("schemes").stream() # Get all state documents
 
33
 
34
  for state_doc in state_docs:
35
- state_name = state_doc.id
36
- scheme_ref = db.collection("schemes").document(state_name).collection("schemes")
37
  scheme_docs = scheme_ref.stream()
38
 
39
  schemes_in_state = []
40
  for scheme_doc in scheme_docs:
41
  data = scheme_doc.to_dict()
42
- data["id"] = scheme_doc.id # Add document ID to the data
43
  schemes_in_state.append(data)
44
 
45
  temp_schemes_cache[state_name] = schemes_in_state
46
 
47
- # Atomically update the global cache after successful fetch
48
  cached_all_schemes = temp_schemes_cache
49
  logger.info(f"Cache loaded successfully. Total states: {len(cached_all_schemes)}")
50
 
@@ -54,45 +55,131 @@ async def load_all_schemes_into_cache():
54
  is_cache_loading = False
55
 
56
 
57
- def get_all_schemes():
58
- """Returns all schemes from the in-memory cache."""
59
- return cached_all_schemes
 
 
 
 
60
 
61
- def get_schemes_by_state(state: str):
62
- """Returns schemes for a specific state from the in-memory cache."""
63
- state_capitalized = state.capitalize()
64
- return cached_all_schemes.get(state_capitalized)
 
65
 
66
- def get_scheme_details_by_title(state: str, title: str):
67
- """Returns details for a single scheme by title within a specific state."""
68
- state_capitalized = state.capitalize()
69
- schemes_for_state = cached_all_schemes.get(state_capitalized)
70
- if schemes_for_state:
71
- for scheme in schemes_for_state:
72
- if scheme.get("id") == title:
73
- return scheme
74
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def search_schemes_in_cache(query: str):
77
- """Searches schemes across all states within the in-memory cache."""
78
  search_query = query.strip().lower()
79
  matched = []
80
 
81
- logger.info(f"Starting search for query: '{search_query}' across {len(cached_all_schemes)} states.")
 
 
 
 
 
 
 
82
 
83
  for state_name, schemes in cached_all_schemes.items():
84
  for scheme in schemes:
85
- title = scheme.get("Title", "")
86
- description = scheme.get("Description", "")
87
-
88
- if search_query in title.lower() or search_query in description.lower():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  result = scheme.copy()
90
  result["state"] = state_name
91
  matched.append(result)
 
92
 
93
  logger.info(f"Search for '{query}' completed. Found {len(matched)} matches.")
94
  return matched
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def get_cache_loading_status():
97
  """Returns the current loading status of the cache."""
98
- return is_cache_loading
 
1
  import asyncio
2
  import logging
3
+ from difflib import SequenceMatcher
4
+ from api.core.firebase_utils import get_firestore_db
5
 
6
  logger = logging.getLogger(__name__)
7
 
 
22
  is_cache_loading = True
23
  logger.info("Starting to load all schemes into cache from Firestore...")
24
  temp_schemes_cache = {}
25
+ db = get_firestore_db()
26
 
27
  if not db:
28
  logger.error("Firestore DB client is not available. Cannot load schemes into cache.")
 
30
  return
31
 
32
  try:
33
+ # Fetch all state docs
34
+ state_docs = db.collection("schemes").stream()
35
 
36
  for state_doc in state_docs:
37
+ state_name = state_doc.id.strip().lower() # store lowercase for consistency
38
+ scheme_ref = db.collection("schemes").document(state_doc.id).collection("schemes")
39
  scheme_docs = scheme_ref.stream()
40
 
41
  schemes_in_state = []
42
  for scheme_doc in scheme_docs:
43
  data = scheme_doc.to_dict()
44
+ data["id"] = scheme_doc.id
45
  schemes_in_state.append(data)
46
 
47
  temp_schemes_cache[state_name] = schemes_in_state
48
 
 
49
  cached_all_schemes = temp_schemes_cache
50
  logger.info(f"Cache loaded successfully. Total states: {len(cached_all_schemes)}")
51
 
 
55
  is_cache_loading = False
56
 
57
 
58
+ # def get_all_schemes(lang=None):
59
+ # """Returns all schemes from the in-memory cache. If lang is provided, filter by language."""
60
+ # if not lang:
61
+ # return cached_all_schemes
62
+
63
+ # filtered_cache = {}
64
+ # for state, schemes in cached_all_schemes.items():
65
 
66
+ # filtered = [s for s in schemes if s.get("language", lang) == lang]
67
+
68
+ # if filtered:
69
+ # filtered_cache[state] = filtered
70
+ # return filtered_cache
71
 
72
+ def get_all_schemes(lang=None):
73
+ """
74
+ Returns all schemes from the in-memory cache.
75
+ If lang is provided, return all schemes that either match lang OR don't have language set.
76
+ """
77
+ if not lang:
78
+ return cached_all_schemes
79
+
80
+ filtered_cache = {}
81
+ for state, schemes in cached_all_schemes.items():
82
+ filtered = [
83
+ s for s in schemes
84
+ if not s.get("language") or s.get("language", "").lower() == lang.lower()
85
+ ]
86
+ if filtered:
87
+ filtered_cache[state] = filtered
88
+ return filtered_cache
89
+
90
+
91
+ def search_schemes_in_cache(query: str, lang: str = None):
92
+ """
93
+ Searches schemes across all states within the in-memory cache with basic stemming.
94
+ Automatically includes schemes that don't have a language field if lang is provided.
95
+ """
96
+ # SequenceMatcher is already imported at module level; no local import needed
97
 
 
 
98
  search_query = query.strip().lower()
99
  matched = []
100
 
101
+ # Create variations of the query for simple stemming
102
+ search_terms = [search_query]
103
+ if search_query.endswith('ies'):
104
+ search_terms.append(search_query[:-3] + 'y')
105
+ elif search_query.endswith('s'):
106
+ search_terms.append(search_query[:-1])
107
+
108
+ logger.info(f"Starting smart search for terms: {search_terms}...")
109
 
110
  for state_name, schemes in cached_all_schemes.items():
111
  for scheme in schemes:
112
+ # Language filter: include scheme if language matches OR no language specified
113
+ language = scheme.get("language", "")
114
+ if lang and language and language.lower() != lang.lower():
115
+ continue
116
+
117
+ # Combine all searchable fields
118
+ searchable_parts = [
119
+ scheme.get("Title", ""),
120
+ scheme.get("Description", ""),
121
+ scheme.get("Tags", ""),
122
+ ]
123
+
124
+ list_fields_to_search = ["Eligibility", "Benefits", "Details", "Documents Required"]
125
+ for field in list_fields_to_search:
126
+ items = scheme.get(field, [])
127
+ if isinstance(items, list):
128
+ searchable_parts.extend(items)
129
+ elif isinstance(items, str):
130
+ searchable_parts.append(items)
131
+
132
+ searchable_text = " ".join(searchable_parts).lower()
133
+
134
+ # Check if any search term is contained or fuzzy match (for typos)
135
+ if any(term in searchable_text for term in search_terms) or \
136
+ any(SequenceMatcher(None, term, searchable_text).ratio() > 0.7 for term in search_terms):
137
  result = scheme.copy()
138
  result["state"] = state_name
139
  matched.append(result)
140
+ # Don't break; allow multiple schemes per state if needed
141
 
142
  logger.info(f"Search for '{query}' completed. Found {len(matched)} matches.")
143
  return matched
144
 
145
+ def get_schemes_by_state(state: str, lang: str = None):
146
+ """
147
+ Returns schemes for a specific state from the in-memory cache.
148
+ """
149
+ state_key = state.strip().lower()
150
+ schemes = cached_all_schemes.get(state_key)
151
+ if not schemes:
152
+ return None
153
+
154
+ if lang:
155
+ return [s for s in schemes if not s.get("language") or s.get("language", "").lower() == lang.lower()]
156
+ return schemes
157
+
158
+ def get_scheme_details_by_title(state: str, title: str, lang: str = None):
159
+ """
160
+ Returns details for a single scheme by title or id within a specific state.
161
+ """
162
+ state_key = state.strip().lower()
163
+ schemes_for_state = cached_all_schemes.get(state_key)
164
+
165
+ if not schemes_for_state:
166
+ return None
167
+
168
+ url_title_clean = title.strip().lower()
169
+
170
+ for scheme in schemes_for_state:
171
+ db_id_clean = scheme.get("id", "").strip().lower()
172
+ db_title_clean = scheme.get("Title", "").strip().lower()
173
+
174
+ if db_id_clean == url_title_clean or db_title_clean == url_title_clean:
175
+ # Language check: accept the scheme when no filter is given, or when its
176
+ # language matches; schemes without a language field pass (default is lang).
177
+ if not lang or scheme.get("language", lang).lower() == lang.lower():
178
+ return scheme
179
+
180
+ return None
181
+
182
+
183
  def get_cache_loading_status():
184
  """Returns the current loading status of the cache."""
185
+ return is_cache_loading