Spaces:
Sleeping
Sleeping
fix: handle long text translation by chunking
Browse files
app.py
CHANGED
|
@@ -57,8 +57,35 @@ async def translate_text(request: TranslationRequest):
|
|
| 57 |
if not state["model_loaded"]:
|
| 58 |
return JSONResponse(content={"message": "Model is not loaded yet"}, status_code=503)
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
@app.get("/health")
|
| 64 |
async def health_check():
|
|
|
|
| 57 |
if not state["model_loaded"]:
|
| 58 |
return JSONResponse(content={"message": "Model is not loaded yet"}, status_code=503)
|
| 59 |
|
| 60 |
+
# Split the text into chunks
|
| 61 |
+
text_chunks = split_text(request.text)
|
| 62 |
+
|
| 63 |
+
# Translate each chunk
|
| 64 |
+
translated_chunks = []
|
| 65 |
+
for chunk in text_chunks:
|
| 66 |
+
# The translator returns a list of dictionaries
|
| 67 |
+
translated_chunk = state["translator"](chunk, max_length=512)
|
| 68 |
+
translated_chunks.append(translated_chunk[0]['translation_text'])
|
| 69 |
+
|
| 70 |
+
# Join the translated chunks
|
| 71 |
+
translated_text = "".join(translated_chunks)
|
| 72 |
+
|
| 73 |
+
return {"translated_text": translated_text}
|
| 74 |
+
|
| 75 |
+
def split_text(text: str, max_length: int = 512):
    """Split *text* into chunks of at most *max_length* characters.

    Splitting prefers the last space before the limit so words are not cut
    in half; a chunk is hard-split at *max_length* only when it contains no
    usable space. Whitespace consumed at a split point is dropped, so the
    caller should rejoin chunks with a space (or re-split on whitespace).

    Args:
        text: The text to split. May be empty.
        max_length: Maximum length of each returned chunk. Defaults to 512,
            matching the translator's ``max_length``.

    Returns:
        A list of non-empty chunks. Empty (or all-whitespace) input yields
        an empty list, so callers never translate an empty string.
    """
    text_chunks = []
    while len(text) > max_length:
        # Find the last space within the limit to avoid splitting words.
        split_at = text.rfind(' ', 0, max_length)
        # <= 0 (not just -1): a space at index 0 would otherwise produce
        # an empty chunk; treat it as "no usable space" and hard-split.
        if split_at <= 0:
            split_at = max_length
        text_chunks.append(text[:split_at])
        # Drop the separator whitespace consumed at the split point.
        text = text[split_at:].lstrip()
    # Guard against emitting an empty trailing chunk (empty input, or a
    # remainder that was pure whitespace and got stripped away).
    if text:
        text_chunks.append(text)
    return text_chunks
|
| 89 |
|
| 90 |
@app.get("/health")
|
| 91 |
async def health_check():
|