Updated API server
Browse files- api_server.py +19 -2
- app/models/html_processor.py +23 -0
api_server.py
CHANGED
|
@@ -47,6 +47,7 @@ class TranslationRequest(BaseModel):
|
|
| 47 |
text: str
|
| 48 |
source_lang_code: str
|
| 49 |
target_lang_code: str
|
|
|
|
| 50 |
|
| 51 |
class TranslationResponse(BaseModel):
|
| 52 |
translated_text: str
|
|
@@ -55,6 +56,7 @@ class HTMLTranslationRequest(BaseModel):
|
|
| 55 |
html: str
|
| 56 |
source_lang_code: str
|
| 57 |
target_lang_code: str
|
|
|
|
| 58 |
|
| 59 |
class HTMLTranslationResponse(BaseModel):
|
| 60 |
translated_html: str
|
|
@@ -93,6 +95,9 @@ async def translate_text(request: TranslationRequest):
|
|
| 93 |
try:
|
| 94 |
logger.info(f"Translating from {request.source_lang_code} to {request.target_lang_code}")
|
| 95 |
|
|
|
|
|
|
|
|
|
|
| 96 |
chunks = text_chunker.create_chunks(request.text)
|
| 97 |
translated_chunks = []
|
| 98 |
|
|
@@ -125,6 +130,14 @@ async def translate_html(request: HTMLTranslationRequest):
|
|
| 125 |
if not text_fragments:
|
| 126 |
return {"translated_html": request.html} # No text to translate
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
translated_fragments = []
|
| 129 |
|
| 130 |
batch_size = 10
|
|
@@ -155,6 +168,7 @@ async def process_document(
|
|
| 155 |
file: UploadFile = File(...),
|
| 156 |
source_lang_code: str = Form(...),
|
| 157 |
target_lang_code: str = Form(...),
|
|
|
|
| 158 |
use_ocr: bool = Form(False)
|
| 159 |
):
|
| 160 |
"""Process and translate document (PDF or image)"""
|
|
@@ -175,6 +189,10 @@ async def process_document(
|
|
| 175 |
status_code=400,
|
| 176 |
detail="No text could be extracted from the document"
|
| 177 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
translated_text = model.translate(
|
| 180 |
extracted_text,
|
|
@@ -191,5 +209,4 @@ async def process_document(
|
|
| 191 |
raise HTTPException(status_code=500, detail=str(e))
|
| 192 |
|
| 193 |
if __name__ == "__main__":
|
| 194 |
-
uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
|
| 195 |
-
|
|
|
|
| 47 |
text: str
|
| 48 |
source_lang_code: str
|
| 49 |
target_lang_code: str
|
| 50 |
+
special_token: str = ""
|
| 51 |
|
class TranslationResponse(BaseModel):
    """Response payload carrying the translated plain text."""

    # Full translated text, chunks re-joined server-side.
    translated_text: str
|
|
|
|
| 56 |
html: str
|
| 57 |
source_lang_code: str
|
| 58 |
target_lang_code: str
|
| 59 |
+
special_token: str = ""
|
| 60 |
|
class HTMLTranslationResponse(BaseModel):
    """Response payload carrying the translated HTML document."""

    # Original markup with text fragments replaced by their translations.
    translated_html: str
|
|
|
|
| 95 |
try:
|
| 96 |
logger.info(f"Translating from {request.source_lang_code} to {request.target_lang_code}")
|
| 97 |
|
| 98 |
+
if request.special_token:
|
| 99 |
+
logger.info(f"Using special language token: {request.special_token}")
|
| 100 |
+
|
| 101 |
chunks = text_chunker.create_chunks(request.text)
|
| 102 |
translated_chunks = []
|
| 103 |
|
|
|
|
| 130 |
if not text_fragments:
|
| 131 |
return {"translated_html": request.html} # No text to translate
|
| 132 |
|
| 133 |
+
# Apply special token to each text fragment if needed
|
| 134 |
+
if request.special_token:
|
| 135 |
+
logger.info(f"Using special language token for HTML: {request.special_token}")
|
| 136 |
+
text_fragments = html_processor.prepare_fragments_with_token(
|
| 137 |
+
text_fragments,
|
| 138 |
+
request.special_token
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
translated_fragments = []
|
| 142 |
|
| 143 |
batch_size = 10
|
|
|
|
| 168 |
file: UploadFile = File(...),
|
| 169 |
source_lang_code: str = Form(...),
|
| 170 |
target_lang_code: str = Form(...),
|
| 171 |
+
special_token: str = Form(""),
|
| 172 |
use_ocr: bool = Form(False)
|
| 173 |
):
|
| 174 |
"""Process and translate document (PDF or image)"""
|
|
|
|
| 189 |
status_code=400,
|
| 190 |
detail="No text could be extracted from the document"
|
| 191 |
)
|
| 192 |
+
|
| 193 |
+
if special_token:
|
| 194 |
+
logger.info(f"Using special language token for document: {special_token}")
|
| 195 |
+
extracted_text = f"{special_token}{extracted_text}"
|
| 196 |
|
| 197 |
translated_text = model.translate(
|
| 198 |
extracted_text,
|
|
|
|
| 209 |
raise HTTPException(status_code=500, detail=str(e))
|
| 210 |
|
| 211 |
if __name__ == "__main__":
|
| 212 |
+
uvicorn.run("api_server:app", host="0.0.0.0", port=7860, reload=True)
|
|
|
app/models/html_processor.py
CHANGED
|
@@ -100,3 +100,26 @@ class HTMLProcessor:
|
|
| 100 |
except Exception as e:
|
| 101 |
logger.error(f"Error replacing text in HTML: {str(e)}")
|
| 102 |
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
except Exception as e:
|
| 101 |
logger.error(f"Error replacing text in HTML: {str(e)}")
|
| 102 |
return ""
|
| 103 |
+
|
| 104 |
+
def prepare_fragments_with_token(self, fragments: List[str], special_token: str) -> List[str]:
    """Prefix each non-blank text fragment with a special language token.

    Args:
        fragments: Text fragments extracted from the HTML document.
        special_token: Special language token to prepend (e.g. '>>tam<<').
            An empty token is a no-op.

    Returns:
        A new list with the token prepended to every fragment containing
        non-whitespace text; whitespace-only fragments pass through
        unchanged so purely structural gaps in the markup keep their
        original content.
    """
    if not special_token:
        # Nothing to prepend — return the input unchanged.
        return fragments
    # Comprehension replaces the manual append loop (same behavior,
    # idiomatic form); blank fragments are deliberately left untouched.
    return [
        f"{special_token}{fragment}" if fragment.strip() else fragment
        for fragment in fragments
    ]