Dyno1307 committed on
Commit
fe5cb5f
·
verified ·
1 Parent(s): 506bae0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -26
app.py CHANGED
@@ -1,6 +1,8 @@
1
  """
2
  A FastAPI application for serving the translation model, inspired by interactive_translate.py.
3
  """
 
 
4
  import torch
5
  from transformers import M2M100ForConditionalGeneration, NllbTokenizer
6
  from fastapi import FastAPI, HTTPException, UploadFile, File
@@ -11,7 +13,10 @@ import logging
11
  from typing import List
12
  import fitz # PyMuPDF
13
  import shutil
14
- import os
 
 
 
15
 
16
  # --- 1. App Configuration ---
17
  logging.basicConfig(level=logging.INFO)
@@ -36,25 +41,30 @@ MODEL_PATH = "facebook/nllb-200-distilled-600M"
36
  model = None
37
  tokenizer = None
38
 
 
39
  # --- 3. Pydantic Models ---
40
  class TranslationRequest(BaseModel):
41
  text: str
42
  source_language: str
43
 
 
44
  class TranslationResponse(BaseModel):
45
  original_text: str
46
  translated_text: str
47
  source_language: str
48
 
 
49
  class BatchTranslationRequest(BaseModel):
50
  texts: List[str]
51
  source_language: str
52
 
 
53
  class BatchTranslationResponse(BaseModel):
54
  original_texts: List[str]
55
  translated_texts: List[str]
56
  source_language: str
57
-
 
58
  class PdfTranslationResponse(BaseModel):
59
  filename: str
60
  translated_text: str
@@ -72,13 +82,11 @@ def load_model_and_tokenizer(model_path):
72
  logger.info("Model and tokenizer loaded successfully!")
73
  except Exception as e:
74
  logger.error(f"Error loading model: {e}")
75
- # In a real app, you might want to exit or handle this more gracefully
76
  raise
77
 
 
78
  def translate_text(text: str, src_lang: str) -> str:
79
- """
80
- Translates a single string of text to English.
81
- """
82
  if src_lang not in SUPPORTED_LANGUAGES:
83
  raise ValueError(f"Language '{src_lang}' not supported.")
84
 
@@ -93,42 +101,46 @@ def translate_text(text: str, src_lang: str) -> str:
93
 
94
  return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
95
 
 
96
  def batch_translate_text(texts: List[str], src_lang: str) -> List[str]:
97
- """
98
- Translates a batch of texts to English.
99
- """
100
  if src_lang not in SUPPORTED_LANGUAGES:
101
  raise ValueError(f"Language '{src_lang}' not supported.")
102
 
103
  tokenizer.src_lang = SUPPORTED_LANGUAGES[src_lang]
104
- # We use padding=True to handle batches of different lengths
105
- inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
 
106
 
107
  generated_tokens = model.generate(
108
  **inputs,
109
  forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
110
- max_length=512, # Allow for longer generated sequences in batches
111
  )
112
 
113
  return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
114
 
 
115
  # --- 5. API Events ---
116
  @app.on_event("startup")
117
  async def startup_event():
118
  """Load the model at startup."""
119
  load_model_and_tokenizer(MODEL_PATH)
120
 
 
121
  # --- 6. API Endpoints ---
122
  @app.get("/")
123
  async def root():
124
  """Returns the frontend."""
125
- return FileResponse('frontend/index.html')
 
126
 
127
  @app.get("/languages")
128
  def get_supported_languages():
129
  """Returns a list of supported languages."""
130
  return {"supported_languages": list(SUPPORTED_LANGUAGES.keys())}
131
 
 
132
  @app.post("/translate", response_model=TranslationResponse)
133
  async def translate(request: TranslationRequest):
134
  """Translates a single text from a source language to English."""
@@ -144,6 +156,7 @@ async def translate(request: TranslationRequest):
144
  except Exception as e:
145
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}")
146
 
 
147
  @app.post("/batch-translate", response_model=BatchTranslationResponse)
148
  async def batch_translate(request: BatchTranslationRequest):
149
  """Translates a batch of texts from a source language to English."""
@@ -159,19 +172,18 @@ async def batch_translate(request: BatchTranslationRequest):
159
  except Exception as e:
160
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}")
161
 
 
162
  @app.post("/translate-pdf", response_model=PdfTranslationResponse)
163
  async def translate_pdf(source_language: str, file: UploadFile = File(...)):
164
  """Translates a PDF file from a source language to English."""
165
  if file.content_type != "application/pdf":
166
  raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
167
 
168
- # Save the uploaded file temporarily
169
  temp_pdf_path = f"temp_{file.filename}"
170
  with open(temp_pdf_path, "wb") as buffer:
171
  shutil.copyfileobj(file.file, buffer)
172
 
173
  try:
174
- # Extract text from the PDF
175
  doc = fitz.open(temp_pdf_path)
176
  extracted_text = ""
177
  for page in doc:
@@ -181,13 +193,8 @@ async def translate_pdf(source_language: str, file: UploadFile = File(...)):
181
  if not extracted_text.strip():
182
  raise HTTPException(status_code=400, detail="Could not extract any text from the PDF.")
183
 
184
- # Split text into chunks (e.g., by paragraph) to handle large texts
185
- text_chunks = [p.strip() for p in extracted_text.split('\n') if p.strip()]
186
-
187
- # Translate the chunks in batches
188
  translated_chunks = batch_translate_text(text_chunks, source_language)
189
-
190
- # Join the translated chunks back together
191
  final_translation = "\n".join(translated_chunks)
192
 
193
  return PdfTranslationResponse(
@@ -199,15 +206,12 @@ async def translate_pdf(source_language: str, file: UploadFile = File(...)):
199
  logger.error(f"Error processing PDF: {e}")
200
  raise HTTPException(status_code=500, detail=f"An error occurred while processing the PDF: {e}")
201
  finally:
202
- # Clean up the temporary file
203
  if os.path.exists(temp_pdf_path):
204
  os.remove(temp_pdf_path)
205
 
206
 
207
- # --- 7. Example Usage (for running with uvicorn) ---
208
- # To run this API, use the following command in your terminal:
209
- # uvicorn fast_api:app --reload
210
-
211
  if __name__ == "__main__":
212
  import uvicorn
 
213
  uvicorn.run(app, host="0.0.0.0", port=8000)
 
1
  """
2
  A FastAPI application for serving the translation model, inspired by interactive_translate.py.
3
  """
4
+
5
+ import os
6
  import torch
7
  from transformers import M2M100ForConditionalGeneration, NllbTokenizer
8
  from fastapi import FastAPI, HTTPException, UploadFile, File
 
13
  from typing import List
14
  import fitz # PyMuPDF
15
  import shutil
16
+
17
+ # ✅ --- 0. Hugging Face Cache Fix ---
18
+ os.environ["HF_HOME"] = "/tmp/huggingface"
19
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
20
 
21
  # --- 1. App Configuration ---
22
  logging.basicConfig(level=logging.INFO)
 
41
  model = None
42
  tokenizer = None
43
 
44
+
45
  # --- 3. Pydantic Models ---
46
  class TranslationRequest(BaseModel):
47
  text: str
48
  source_language: str
49
 
50
+
51
  class TranslationResponse(BaseModel):
52
  original_text: str
53
  translated_text: str
54
  source_language: str
55
 
56
+
57
  class BatchTranslationRequest(BaseModel):
58
  texts: List[str]
59
  source_language: str
60
 
61
+
62
  class BatchTranslationResponse(BaseModel):
63
  original_texts: List[str]
64
  translated_texts: List[str]
65
  source_language: str
66
+
67
+
68
  class PdfTranslationResponse(BaseModel):
69
  filename: str
70
  translated_text: str
 
82
  logger.info("Model and tokenizer loaded successfully!")
83
  except Exception as e:
84
  logger.error(f"Error loading model: {e}")
 
85
  raise
86
 
87
+
88
  def translate_text(text: str, src_lang: str) -> str:
89
+ """Translates a single string of text to English."""
 
 
90
  if src_lang not in SUPPORTED_LANGUAGES:
91
  raise ValueError(f"Language '{src_lang}' not supported.")
92
 
 
101
 
102
  return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
103
 
104
+
105
  def batch_translate_text(texts: List[str], src_lang: str) -> List[str]:
106
+ """Translates a batch of texts to English."""
 
 
107
  if src_lang not in SUPPORTED_LANGUAGES:
108
  raise ValueError(f"Language '{src_lang}' not supported.")
109
 
110
  tokenizer.src_lang = SUPPORTED_LANGUAGES[src_lang]
111
+ inputs = tokenizer(
112
+ texts, return_tensors="pt", padding=True, truncation=True, max_length=512
113
+ ).to(DEVICE)
114
 
115
  generated_tokens = model.generate(
116
  **inputs,
117
  forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
118
+ max_length=512,
119
  )
120
 
121
  return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
122
 
123
+
124
  # --- 5. API Events ---
125
  @app.on_event("startup")
126
  async def startup_event():
127
  """Load the model at startup."""
128
  load_model_and_tokenizer(MODEL_PATH)
129
 
130
+
131
  # --- 6. API Endpoints ---
132
  @app.get("/")
133
  async def root():
134
  """Returns the frontend."""
135
+ return FileResponse("frontend/index.html")
136
+
137
 
138
  @app.get("/languages")
139
  def get_supported_languages():
140
  """Returns a list of supported languages."""
141
  return {"supported_languages": list(SUPPORTED_LANGUAGES.keys())}
142
 
143
+
144
  @app.post("/translate", response_model=TranslationResponse)
145
  async def translate(request: TranslationRequest):
146
  """Translates a single text from a source language to English."""
 
156
  except Exception as e:
157
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}")
158
 
159
+
160
  @app.post("/batch-translate", response_model=BatchTranslationResponse)
161
  async def batch_translate(request: BatchTranslationRequest):
162
  """Translates a batch of texts from a source language to English."""
 
172
  except Exception as e:
173
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}")
174
 
175
+
176
  @app.post("/translate-pdf", response_model=PdfTranslationResponse)
177
  async def translate_pdf(source_language: str, file: UploadFile = File(...)):
178
  """Translates a PDF file from a source language to English."""
179
  if file.content_type != "application/pdf":
180
  raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")
181
 
 
182
  temp_pdf_path = f"temp_{file.filename}"
183
  with open(temp_pdf_path, "wb") as buffer:
184
  shutil.copyfileobj(file.file, buffer)
185
 
186
  try:
 
187
  doc = fitz.open(temp_pdf_path)
188
  extracted_text = ""
189
  for page in doc:
 
193
  if not extracted_text.strip():
194
  raise HTTPException(status_code=400, detail="Could not extract any text from the PDF.")
195
 
196
+ text_chunks = [p.strip() for p in extracted_text.split("\n") if p.strip()]
 
 
 
197
  translated_chunks = batch_translate_text(text_chunks, source_language)
 
 
198
  final_translation = "\n".join(translated_chunks)
199
 
200
  return PdfTranslationResponse(
 
206
  logger.error(f"Error processing PDF: {e}")
207
  raise HTTPException(status_code=500, detail=f"An error occurred while processing the PDF: {e}")
208
  finally:
 
209
  if os.path.exists(temp_pdf_path):
210
  os.remove(temp_pdf_path)
211
 
212
 
213
+ # --- 7. Run Locally ---
 
 
 
214
  if __name__ == "__main__":
215
  import uvicorn
216
+
217
  uvicorn.run(app, host="0.0.0.0", port=8000)