Arsive2 committed on
Commit
d0d0352
·
1 Parent(s): 0bf2d2c

Updated comments

Browse files
api_server.py CHANGED
@@ -1,42 +1,38 @@
1
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException
2
- from fastapi.middleware.cors import CORSMiddleware
3
- from pydantic import BaseModel
4
- from typing import Optional, Dict, Any, List
5
- import torch
6
- import os
7
  import logging
 
 
 
8
  import uvicorn
9
- from app.models.translation_model import TranslationModel
 
 
 
 
10
  from app.models.html_processor import HTMLProcessor
11
  from app.models.text_chunker import TextChunker
12
- from app.models.document_processor import DocumentProcessor
13
 
14
- # Configure logging
15
  logging.basicConfig(
16
  level=logging.INFO,
17
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
  )
19
  logger = logging.getLogger(__name__)
20
 
21
- # Initialize FastAPI app
22
  app = FastAPI(
23
  title="Universal Translator API",
24
  description="API for text, HTML, and document translation services",
25
  version="1.0.0"
26
  )
27
 
28
- # Configure CORS
29
  app.add_middleware(
30
  CORSMiddleware,
31
- allow_origins=["*"], # Adjust in production
32
  allow_credentials=True,
33
  allow_methods=["*"],
34
  allow_headers=["*"],
35
  )
36
 
37
- # Initialize model components
38
  try:
39
- # Use the CPU-optimized translation model
40
  model = TranslationModel()
41
  html_processor = HTMLProcessor()
42
  text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
@@ -47,7 +43,6 @@ except Exception as e:
47
  logger.error(f"Error initializing components: {str(e)}")
48
  initialization_error = str(e)
49
 
50
- # Define request/response models
51
  class TranslationRequest(BaseModel):
52
  text: str
53
  source_lang_code: str
@@ -96,14 +91,11 @@ async def translate_text(request: TranslationRequest):
96
  raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
97
 
98
  try:
99
- # Using the OPUS-MT/NLLB hybrid model for more efficient translation
100
  logger.info(f"Translating from {request.source_lang_code} to {request.target_lang_code}")
101
 
102
- # Create chunks using TextChunker for long texts
103
  chunks = text_chunker.create_chunks(request.text)
104
  translated_chunks = []
105
 
106
- # Translate each chunk
107
  for chunk in chunks:
108
  translated_text = model.translate(
109
  chunk.text,
@@ -112,7 +104,6 @@ async def translate_text(request: TranslationRequest):
112
  )
113
  translated_chunks.append(translated_text)
114
 
115
- # Combine translations
116
  final_translation = text_chunker.combine_translations(
117
  request.text, chunks, translated_chunks
118
  )
@@ -129,16 +120,13 @@ async def translate_html(request: HTMLTranslationRequest):
129
  raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
130
 
131
  try:
132
- # Extract text and maintain exact DOM structure
133
  text_fragments, dom_data = html_processor.extract_text(request.html)
134
 
135
  if not text_fragments:
136
  return {"translated_html": request.html} # No text to translate
137
 
138
- # Process each text fragment individually
139
  translated_fragments = []
140
 
141
- # Process in smaller batches to avoid timeouts
142
  batch_size = 10
143
  for i in range(0, len(text_fragments), batch_size):
144
  batch = text_fragments[i:i+batch_size]
@@ -155,7 +143,6 @@ async def translate_html(request: HTMLTranslationRequest):
155
  )
156
  translated_fragments.append(translated_text)
157
 
158
- # Replace the original text with translated text in the HTML structure
159
  translated_html = html_processor.replace_text(dom_data, translated_fragments)
160
 
161
  return {"translated_html": translated_html}
@@ -175,10 +162,8 @@ async def process_document(
175
  raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
176
 
177
  try:
178
- # Read file content
179
  file_content = await file.read()
180
 
181
- # Process document to extract text
182
  extracted_text = document_processor.process_document(
183
  file_data=file_content,
184
  filename=file.filename,
@@ -191,7 +176,6 @@ async def process_document(
191
  detail="No text could be extracted from the document"
192
  )
193
 
194
- # Translate the extracted text using our more efficient model
195
  translated_text = model.translate(
196
  extracted_text,
197
  source_lang_code,
@@ -207,4 +191,5 @@ async def process_document(
207
  raise HTTPException(status_code=500, detail=str(e))
208
 
209
  if __name__ == "__main__":
210
- uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
 
 
 
 
 
 
 
 
1
  import logging
2
+ import os
3
+
4
+ import torch
5
  import uvicorn
6
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel
9
+
10
+ from app.models.document_processor import DocumentProcessor
11
  from app.models.html_processor import HTMLProcessor
12
  from app.models.text_chunker import TextChunker
13
+ from app.models.translation_model import TranslationModel
14
 
 
15
  logging.basicConfig(
16
  level=logging.INFO,
17
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
  )
19
  logger = logging.getLogger(__name__)
20
 
 
21
  app = FastAPI(
22
  title="Universal Translator API",
23
  description="API for text, HTML, and document translation services",
24
  version="1.0.0"
25
  )
26
 
 
27
  app.add_middleware(
28
  CORSMiddleware,
29
+ allow_origins=["*"],
30
  allow_credentials=True,
31
  allow_methods=["*"],
32
  allow_headers=["*"],
33
  )
34
 
 
35
  try:
 
36
  model = TranslationModel()
37
  html_processor = HTMLProcessor()
38
  text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
 
43
  logger.error(f"Error initializing components: {str(e)}")
44
  initialization_error = str(e)
45
 
 
46
  class TranslationRequest(BaseModel):
47
  text: str
48
  source_lang_code: str
 
91
  raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
92
 
93
  try:
 
94
  logger.info(f"Translating from {request.source_lang_code} to {request.target_lang_code}")
95
 
 
96
  chunks = text_chunker.create_chunks(request.text)
97
  translated_chunks = []
98
 
 
99
  for chunk in chunks:
100
  translated_text = model.translate(
101
  chunk.text,
 
104
  )
105
  translated_chunks.append(translated_text)
106
 
 
107
  final_translation = text_chunker.combine_translations(
108
  request.text, chunks, translated_chunks
109
  )
 
120
  raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
121
 
122
  try:
 
123
  text_fragments, dom_data = html_processor.extract_text(request.html)
124
 
125
  if not text_fragments:
126
  return {"translated_html": request.html} # No text to translate
127
 
 
128
  translated_fragments = []
129
 
 
130
  batch_size = 10
131
  for i in range(0, len(text_fragments), batch_size):
132
  batch = text_fragments[i:i+batch_size]
 
143
  )
144
  translated_fragments.append(translated_text)
145
 
 
146
  translated_html = html_processor.replace_text(dom_data, translated_fragments)
147
 
148
  return {"translated_html": translated_html}
 
162
  raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")
163
 
164
  try:
 
165
  file_content = await file.read()
166
 
 
167
  extracted_text = document_processor.process_document(
168
  file_data=file_content,
169
  filename=file.filename,
 
176
  detail="No text could be extracted from the document"
177
  )
178
 
 
179
  translated_text = model.translate(
180
  extracted_text,
181
  source_lang_code,
 
191
  raise HTTPException(status_code=500, detail=str(e))
192
 
193
  if __name__ == "__main__":
194
+ uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
195
+
app/models/document_processor.py CHANGED
@@ -1,7 +1,8 @@
1
- import fitz # PyMuPDF
2
  import logging
3
  from pathlib import Path
4
 
 
 
5
  logger = logging.getLogger(__name__)
6
 
7
  class DocumentProcessor:
@@ -35,15 +36,11 @@ class DocumentProcessor:
35
  if file_ext not in self.supported_formats:
36
  raise ValueError(f"Unsupported file format: {file_ext}")
37
 
38
- # Process PDF using PyMuPDF
39
  if file_ext == '.pdf':
40
  return self._process_pdf(file_data)
41
 
42
- # Process image (placeholder - would need OCR integration)
43
  else:
44
  if use_ocr:
45
- # Placeholder for OCR implementation
46
- # You would integrate with an OCR service here
47
  raise NotImplementedError("OCR for images not implemented")
48
  else:
49
  return "Text extraction from images requires OCR to be enabled"
 
 
1
  import logging
2
  from pathlib import Path
3
 
4
+ import fitz # PyMuPDF
5
+
6
  logger = logging.getLogger(__name__)
7
 
8
  class DocumentProcessor:
 
36
  if file_ext not in self.supported_formats:
37
  raise ValueError(f"Unsupported file format: {file_ext}")
38
 
 
39
  if file_ext == '.pdf':
40
  return self._process_pdf(file_data)
41
 
 
42
  else:
43
  if use_ocr:
 
 
44
  raise NotImplementedError("OCR for images not implemented")
45
  else:
46
  return "Text extraction from images requires OCR to be enabled"
app/models/html_processor.py CHANGED
@@ -1,6 +1,7 @@
1
  import logging
 
 
2
  from bs4 import BeautifulSoup, NavigableString, Tag
3
- from typing import List, Tuple, Dict, Any
4
 
5
  logger = logging.getLogger(__name__)
6
 
@@ -30,14 +31,11 @@ class HTMLProcessor:
30
  - DOM map that maintains references to the exact nodes in the original structure
31
  """
32
  try:
33
- # Parse the HTML using 'html.parser' to ensure proper handling
34
  soup = BeautifulSoup(html_content, 'html.parser')
35
 
36
- # Use a list to store text fragments and their corresponding nodes
37
  text_fragments = []
38
  dom_map = {}
39
 
40
- # Process the soup to find all text nodes
41
  self._extract_text_from_node(soup, text_fragments, dom_map)
42
 
43
  return text_fragments, {'soup': soup, 'node_map': dom_map}
@@ -56,24 +54,19 @@ class HTMLProcessor:
56
  dom_map: Dictionary to map indices to nodes
57
  path: Current path in the DOM tree for debugging
58
  """
59
- # Skip processing for certain tags
60
  if isinstance(node, Tag) and node.name in self.skip_tags:
61
  return
62
 
63
- # Skip elements with notranslate class
64
  if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
65
  return
66
 
67
- # Process this node
68
  if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
69
- # Only process non-empty text
70
  text = str(node).strip()
71
  if text:
72
  index = len(text_fragments)
73
  text_fragments.append(text)
74
  dom_map[index] = node
75
 
76
- # Recursively process child nodes
77
  if isinstance(node, Tag):
78
  for child in node.children:
79
  child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
@@ -98,13 +91,10 @@ class HTMLProcessor:
98
  logger.error("Invalid DOM data for text replacement")
99
  return ""
100
 
101
- # Replace text in each node
102
  for index, node in node_map.items():
103
  if index < len(translated_fragments):
104
- # Replace the original string with the translated string
105
  node.replace_with(NavigableString(translated_fragments[index]))
106
 
107
- # Return the HTML as a string
108
  return str(soup)
109
 
110
  except Exception as e:
 
1
  import logging
2
+ from typing import Any, Dict, List, Tuple
3
+
4
  from bs4 import BeautifulSoup, NavigableString, Tag
 
5
 
6
  logger = logging.getLogger(__name__)
7
 
 
31
  - DOM map that maintains references to the exact nodes in the original structure
32
  """
33
  try:
 
34
  soup = BeautifulSoup(html_content, 'html.parser')
35
 
 
36
  text_fragments = []
37
  dom_map = {}
38
 
 
39
  self._extract_text_from_node(soup, text_fragments, dom_map)
40
 
41
  return text_fragments, {'soup': soup, 'node_map': dom_map}
 
54
  dom_map: Dictionary to map indices to nodes
55
  path: Current path in the DOM tree for debugging
56
  """
 
57
  if isinstance(node, Tag) and node.name in self.skip_tags:
58
  return
59
 
 
60
  if isinstance(node, Tag) and node.get('class') and self.skip_translation_class in node.get('class'):
61
  return
62
 
 
63
  if isinstance(node, NavigableString) and node.parent and node.parent.name not in self.skip_tags:
 
64
  text = str(node).strip()
65
  if text:
66
  index = len(text_fragments)
67
  text_fragments.append(text)
68
  dom_map[index] = node
69
 
 
70
  if isinstance(node, Tag):
71
  for child in node.children:
72
  child_path = f"{path}/{child.name}" if isinstance(child, Tag) else path
 
91
  logger.error("Invalid DOM data for text replacement")
92
  return ""
93
 
 
94
  for index, node in node_map.items():
95
  if index < len(translated_fragments):
 
96
  node.replace_with(NavigableString(translated_fragments[index]))
97
 
 
98
  return str(soup)
99
 
100
  except Exception as e:
app/models/text_chunker.py CHANGED
@@ -1,17 +1,15 @@
1
- import re
2
  import logging
3
  import os
4
- import nltk
5
-
6
- from typing import List, Optional
7
  from dataclasses import dataclass
 
 
 
8
  from nltk.tokenize import sent_tokenize
9
 
10
- # Set NLTK data path from environment variable if available
11
  nltk_data_path = os.environ.get('NLTK_DATA', '/app/nltk_data')
12
  nltk.data.path.append(nltk_data_path)
13
 
14
- # Ensure NLTK data is downloaded
15
  try:
16
  nltk.data.find('tokenizers/punkt')
17
  except LookupError:
@@ -19,7 +17,6 @@ except LookupError:
19
  nltk.download('punkt', download_dir=nltk_data_path)
20
  except Exception as e:
21
  logging.warning(f"Failed to download NLTK data: {e}")
22
- # Fallback to not using NLTK if download fails
23
 
24
  logger = logging.getLogger(__name__)
25
 
@@ -62,22 +59,16 @@ class TextChunker:
62
  if not text:
63
  return ""
64
 
65
- # Replace multiple newlines with single \n
66
  text = re.sub(r'\n\s*\n', '\n', text)
67
 
68
- # Replace other whitespace characters with space
69
  text = re.sub(r'[\r\t\f\v]', ' ', text)
70
 
71
- # Replace multiple spaces with single space
72
  text = re.sub(r' +', ' ', text)
73
 
74
- # Clean up spaces around newlines
75
  text = re.sub(r' *\n *', '\n', text)
76
 
77
- # Remove spaces at the start and end of the text
78
  text = text.strip()
79
 
80
- # Handle bullet points and lists consistently
81
  text = re.sub(r'•\s*', '• ', text)
82
  text = re.sub(r'^\s*[-*]\s+', '• ', text, flags=re.MULTILINE)
83
 
@@ -88,7 +79,6 @@ class TextChunker:
88
  Estimate the number of tokens in a text string.
89
  This is a rough approximation - actual token count may vary by tokenizer.
90
  """
91
- # Split on whitespace and punctuation
92
  words = re.findall(r'\b\w+\b|[^\w\s]', text)
93
  return len(words)
94
 
@@ -98,7 +88,6 @@ class TextChunker:
98
  return sent_tokenize(text)
99
  except Exception as e:
100
  logger.warning(f"Error in sentence tokenization: {e}")
101
- # Fallback to simple period-based splitting
102
  return [s.strip() + '.' for s in text.split('.') if s.strip()]
103
 
104
  def get_chunk_text(self, sentences: List[str], start_idx: int, max_tokens: int) -> tuple:
@@ -114,7 +103,6 @@ class TextChunker:
114
  sentence = sentences[i]
115
  sentence_tokens = self.estimate_tokens(sentence)
116
 
117
- # If single sentence exceeds max tokens, split it
118
  if sentence_tokens > max_tokens:
119
  if not current_sentences: # First sentence
120
  words = sentence.split()
@@ -134,7 +122,6 @@ class TextChunker:
134
  return chunk_text, i, is_partial
135
  break
136
 
137
- # Check if adding this sentence would exceed the limit
138
  if current_tokens + sentence_tokens > max_tokens and current_sentences:
139
  break
140
 
@@ -160,13 +147,11 @@ class TextChunker:
160
  chunks = []
161
  current_idx = 0
162
 
163
- # Split into paragraphs if preserve_paragraphs is True
164
  if self.preserve_paragraphs:
165
  paragraphs = text.split('\n')
166
  else:
167
  paragraphs = [text]
168
 
169
- # Process each paragraph
170
  for para in paragraphs:
171
  if not para.strip():
172
  continue
@@ -182,7 +167,6 @@ class TextChunker:
182
  if not chunk_text:
183
  break
184
 
185
- # Calculate original text positions
186
  original_start = text.find(chunk_text)
187
  original_end = original_start + len(chunk_text)
188
 
@@ -222,11 +206,9 @@ class TextChunker:
222
  if len(chunks) == 1:
223
  return translations[0]
224
 
225
- # Combine translations, handling partial sentences
226
  result = []
227
  for i, (chunk, translation) in enumerate(zip(chunks, translations)):
228
  if i > 0 and chunk.is_partial_sentence:
229
- # For partial sentences, try to find a clean break point
230
  prev_translation = translations[i-1]
231
  overlap = self._find_overlap(prev_translation, translation)
232
  if overlap:
@@ -241,15 +223,14 @@ class TextChunker:
241
  if not text1 or not text2:
242
  return None
243
 
244
- # Get the last part of text1 and first part of text2
245
  end_text = text1[-100:] # Look at last 100 chars
246
  start_text = text2[:100] # Look at first 100 chars
247
 
248
- # Find the longest common substring
249
  overlap = None
250
  for length in range(min(len(end_text), len(start_text)), min_length - 1, -1):
251
  if end_text[-length:] == start_text[:length]:
252
  overlap = start_text[:length]
253
  break
254
 
255
- return overlap
 
 
 
1
  import logging
2
  import os
3
+ import re
 
 
4
  from dataclasses import dataclass
5
+ from typing import List, Optional
6
+
7
+ import nltk
8
  from nltk.tokenize import sent_tokenize
9
 
 
10
  nltk_data_path = os.environ.get('NLTK_DATA', '/app/nltk_data')
11
  nltk.data.path.append(nltk_data_path)
12
 
 
13
  try:
14
  nltk.data.find('tokenizers/punkt')
15
  except LookupError:
 
17
  nltk.download('punkt', download_dir=nltk_data_path)
18
  except Exception as e:
19
  logging.warning(f"Failed to download NLTK data: {e}")
 
20
 
21
  logger = logging.getLogger(__name__)
22
 
 
59
  if not text:
60
  return ""
61
 
 
62
  text = re.sub(r'\n\s*\n', '\n', text)
63
 
 
64
  text = re.sub(r'[\r\t\f\v]', ' ', text)
65
 
 
66
  text = re.sub(r' +', ' ', text)
67
 
 
68
  text = re.sub(r' *\n *', '\n', text)
69
 
 
70
  text = text.strip()
71
 
 
72
  text = re.sub(r'•\s*', '• ', text)
73
  text = re.sub(r'^\s*[-*]\s+', '• ', text, flags=re.MULTILINE)
74
 
 
79
  Estimate the number of tokens in a text string.
80
  This is a rough approximation - actual token count may vary by tokenizer.
81
  """
 
82
  words = re.findall(r'\b\w+\b|[^\w\s]', text)
83
  return len(words)
84
 
 
88
  return sent_tokenize(text)
89
  except Exception as e:
90
  logger.warning(f"Error in sentence tokenization: {e}")
 
91
  return [s.strip() + '.' for s in text.split('.') if s.strip()]
92
 
93
  def get_chunk_text(self, sentences: List[str], start_idx: int, max_tokens: int) -> tuple:
 
103
  sentence = sentences[i]
104
  sentence_tokens = self.estimate_tokens(sentence)
105
 
 
106
  if sentence_tokens > max_tokens:
107
  if not current_sentences: # First sentence
108
  words = sentence.split()
 
122
  return chunk_text, i, is_partial
123
  break
124
 
 
125
  if current_tokens + sentence_tokens > max_tokens and current_sentences:
126
  break
127
 
 
147
  chunks = []
148
  current_idx = 0
149
 
 
150
  if self.preserve_paragraphs:
151
  paragraphs = text.split('\n')
152
  else:
153
  paragraphs = [text]
154
 
 
155
  for para in paragraphs:
156
  if not para.strip():
157
  continue
 
167
  if not chunk_text:
168
  break
169
 
 
170
  original_start = text.find(chunk_text)
171
  original_end = original_start + len(chunk_text)
172
 
 
206
  if len(chunks) == 1:
207
  return translations[0]
208
 
 
209
  result = []
210
  for i, (chunk, translation) in enumerate(zip(chunks, translations)):
211
  if i > 0 and chunk.is_partial_sentence:
 
212
  prev_translation = translations[i-1]
213
  overlap = self._find_overlap(prev_translation, translation)
214
  if overlap:
 
223
  if not text1 or not text2:
224
  return None
225
 
 
226
  end_text = text1[-100:] # Look at last 100 chars
227
  start_text = text2[:100] # Look at first 100 chars
228
 
 
229
  overlap = None
230
  for length in range(min(len(end_text), len(start_text)), min_length - 1, -1):
231
  if end_text[-length:] == start_text[:length]:
232
  overlap = start_text[:length]
233
  break
234
 
235
+ return overlap
236
+
app/models/translation_model.py CHANGED
@@ -1,10 +1,10 @@
1
- import torch
2
  import logging
3
- import re
4
  import os
5
- from typing import Optional, Dict, Any, List
6
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
- from tqdm import tqdm
 
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
 
 
1
  import logging
 
2
  import os
3
+ import re
4
+ from typing import Optional
5
+
6
+ import torch
7
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
8
 
9
  logger = logging.getLogger(__name__)
10