Spaces:

Arsive
/

lt_space

Sleeping

App Files Files Community

lt_space / api_server.py

Arsive2

Updated ct translate

6a6828e 9 months ago

raw

history blame contribute delete

9.68 kB

	import logging
	import os

	import torch
	import uvicorn
	from fastapi import FastAPI, File, Form, HTTPException, UploadFile
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel

	from app.models.document_processor import DocumentProcessor
	from app.models.html_processor import HTMLProcessor
	from app.models.text_chunker import TextChunker
	from app.models.translation_model_ct2 import TranslationModelCT2

	# Configure root logging once at import time: INFO and above, with each record
	# prefixed by timestamp, logger name and level.
	logging.basicConfig(
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	# Module-scoped logger shared by the endpoints defined in this file.
	logger = logging.getLogger(__name__)

	# ASGI application object; title, description and version are the API metadata.
	app = FastAPI(
	    version="1.0.0",
	    title="Universal Translator API",
	    description="API for text, HTML, and document translation services",
	)

	# CORS is left fully open: any origin, method and header is accepted.
	# NOTE(review): wildcard origins combined with allow_credentials=True is very
	# permissive -- confirm this is intended before exposing authenticated routes.
	app.add_middleware(
	    CORSMiddleware,
	    allow_credentials=True,
	    allow_origins=["*"],
	    allow_headers=["*"],
	    allow_methods=["*"],
	)

	# Bind every component name up front so they always exist at module level.
	# If construction fails part-way, the remaining names stay None instead of being
	# undefined, which lets the health endpoints report the failure rather than
	# crash with NameError.
	model = None
	html_processor = None
	text_chunker = None
	document_processor = None
	initialization_error = None

	try:
	    model = TranslationModelCT2(
	        model_cache_dir=os.getenv("CT2_MODEL_CACHE", ".cache/ct2_models")
	    )
	    html_processor = HTMLProcessor()
	    text_chunker = TextChunker(max_tokens=250, overlap_tokens=30)
	    document_processor = DocumentProcessor()
	except Exception as e:
	    # Top-level boundary: keep the module importable so the API can start and
	    # answer health checks; every endpoint inspects initialization_error.
	    logger.exception("Error initializing components: %s", e)
	    # Some exceptions stringify to ""; fall back to the class name so the error
	    # flag is never falsy after a failure.
	    initialization_error = str(e) or type(e).__name__

	class TranslationRequest(BaseModel):
	    """Request body accepted by the POST /translate endpoint."""

	    # Text to be translated.
	    text: str
	    # Language code of the incoming text.
	    source_lang_code: str
	    # Language code the text should be translated into.
	    target_lang_code: str

	class TranslationResponse(BaseModel):
	    """Response body returned by the POST /translate endpoint."""

	    # Translation produced for the submitted text.
	    translated_text: str

	class HTMLTranslationRequest(BaseModel):
	    """Request body accepted by the POST /translate-html endpoint."""

	    # HTML markup whose text content should be translated.
	    html: str
	    # Language code of the text inside the markup.
	    source_lang_code: str
	    # Language code the text should be translated into.
	    target_lang_code: str

	class HTMLTranslationResponse(BaseModel):
	    """Response body returned by the POST /translate-html endpoint."""

	    # Markup with its text content replaced by the translation.
	    translated_html: str

	@app.get("/")
	async def root():
	    """Basic health check.

	    Returns:
	        A status dict: ``status`` "ok" plus model/version labels when start-up
	        succeeded, otherwise ``status`` "error" with the stored start-up error.
	    """
	    if not initialization_error:
	        return {"status": "ok", "model": "OPUS-MT/NLLB-CPU-Optimized", "version": "1.0"}

	    # Start-up failed: surface the recorded error message to the caller.
	    failure_report = {
	        "status": "error",
	        "message": "Service initialization failed",
	        "error": initialization_error,
	    }
	    return failure_report

	@app.get("/health")
	async def health_check():
	    """Extended health check with environment information.

	    Returns:
	        A dict with ``status`` ("ok" or "error"), the stored initialization
	        ``error`` (or None) and an ``environment`` section holding the Python
	        version, CUDA availability, the model device and the model info.
	    """
	    # Local import: only this endpoint needs the interpreter version fallback.
	    import platform

	    # Look the model up defensively. When start-up failed the module-level name
	    # may never have been bound, and this endpoint must still answer in order to
	    # report that failure.
	    translator = globals().get("model")

	    if hasattr(translator, "device"):
	        device_name = str(translator.device)
	    else:
	        device_name = "Unknown"

	    if hasattr(translator, "get_model_info"):
	        model_details = translator.get_model_info()
	    else:
	        model_details = {}

	    return {
	        "status": "ok" if not initialization_error else "error",
	        "error": initialization_error,
	        "environment": {
	            # PYTHON_VERSION is only set in some images; fall back to the
	            # running interpreter so the field is never None.
	            "python_version": os.environ.get('PYTHON_VERSION') or platform.python_version(),
	            "cuda_available": torch.cuda.is_available(),
	            "device": device_name,
	            "model_info": model_details,
	        },
	    }

	@app.post("/translate", response_model=TranslationResponse)
	async def translate_text(request: TranslationRequest):
	    """Translate text from source to target language.

	    Args:
	        request: Body carrying ``text``, ``source_lang_code`` and
	            ``target_lang_code``.

	    Returns:
	        A dict with the single key ``translated_text``.

	    Raises:
	        HTTPException: 500 when the service failed to initialize or when the
	            translation itself raises.
	    """
	    if initialization_error:
	        raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")

	    # Nothing to translate: return the input unchanged instead of sending an
	    # empty string (or a bare language token) to the model.
	    if not request.text.strip():
	        return {"translated_text": request.text}

	    try:
	        logger.info(
	            "Translating from %s to %s",
	            request.source_lang_code,
	            request.target_lang_code,
	        )

	        # These four targets are sent to the model under the shared code "dra"
	        # and selected with a ">>xxx<<" prefix on the input text.
	        if request.target_lang_code in ("tam", "tel", "kan", "mal"):
	            language_token = f">>{request.target_lang_code}<<"
	            model_target_code = "dra"
	        else:
	            language_token = ""
	            model_target_code = request.target_lang_code

	        if len(language_token) + len(request.text) > 1000:
	            # Chunk the raw text and prefix EVERY chunk with the token. Prefixing
	            # the whole text before chunking would leave all chunks after the
	            # first without a target-language selector.
	            chunks = text_chunker.create_chunks(request.text)
	            chunk_texts = [f"{language_token}{chunk.text}" for chunk in chunks]

	            translated_chunks = model.translate_batch(
	                chunk_texts,
	                request.source_lang_code,
	                model_target_code,
	            )

	            final_translation = text_chunker.combine_translations(
	                request.text, chunks, translated_chunks
	            )
	        else:
	            final_translation = model.translate(
	                f"{language_token}{request.text}",
	                request.source_lang_code,
	                model_target_code,
	            )

	        return {"translated_text": final_translation}
	    except Exception as e:
	        # Endpoint boundary: log with traceback and report as a 500.
	        logger.exception("Translation error: %s", e)
	        raise HTTPException(status_code=500, detail=str(e)) from e

	@app.post("/translate-html", response_model=HTMLTranslationResponse)
	async def translate_html(request: HTMLTranslationRequest):
	    """Translate HTML content while preserving structure.

	    Args:
	        request: Body carrying ``html``, ``source_lang_code`` and
	            ``target_lang_code``.

	    Returns:
	        A dict with the single key ``translated_html``. The original markup is
	        returned unchanged when it contains no translatable text.

	    Raises:
	        HTTPException: 500 when the service failed to initialize or when
	            extraction, translation or re-insertion raises.
	    """
	    if initialization_error:
	        raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")

	    try:
	        text_fragments, dom_data = html_processor.extract_text(request.html)

	        # Positions of fragments that contain something other than whitespace;
	        # only these are sent to the model.
	        translatable_positions = [
	            position
	            for position, fragment in enumerate(text_fragments)
	            if fragment.strip()
	        ]

	        if not translatable_positions:
	            # No text to translate (no fragments at all, or only whitespace).
	            return {"translated_html": request.html}

	        # These four targets are sent to the model under the shared code "dra"
	        # and selected with a ">>xxx<<" prefix on each fragment.
	        if request.target_lang_code in ("tam", "tel", "kan", "mal"):
	            special_token = f">>{request.target_lang_code}<<"
	            modified_target_code = "dra"
	            logger.info("Using special language token for HTML: %s", special_token)
	        else:
	            special_token = ""
	            modified_target_code = request.target_lang_code

	        translated_fragments = model.translate_batch(
	            [f"{special_token}{text_fragments[position]}" for position in translatable_positions],
	            request.source_lang_code,
	            modified_target_code,
	        )

	        # A length mismatch would silently misplace text in the document.
	        if len(translated_fragments) != len(translatable_positions):
	            raise ValueError(
	                f"Expected {len(translatable_positions)} translated fragments, "
	                f"got {len(translated_fragments)}"
	            )

	        # Start from the original fragments so whitespace-only entries keep their
	        # exact spacing, then overwrite the translated positions.
	        full_translated_fragments = list(text_fragments)
	        for position, translation in zip(translatable_positions, translated_fragments):
	            full_translated_fragments[position] = translation

	        translated_html = html_processor.replace_text(dom_data, full_translated_fragments)

	        return {"translated_html": translated_html}
	    except Exception as e:
	        # Endpoint boundary: log with traceback and report as a 500.
	        logger.exception("HTML translation error: %s", e)
	        raise HTTPException(status_code=500, detail=str(e)) from e

	@app.post("/process-document")
	async def process_document(
	    file: UploadFile = File(...),
	    source_lang_code: str = Form(...),
	    target_lang_code: str = Form(...),
	    use_ocr: bool = Form(False)
	):
	    """Process and translate document (PDF or image).

	    Args:
	        file: Uploaded document; its bytes and filename are handed to the
	            document processor.
	        source_lang_code: Language code of the document text.
	        target_lang_code: Language code to translate into.
	        use_ocr: Forwarded to the document processor as ``use_ocr``.

	    Returns:
	        A dict with ``extracted_text`` and ``translated_text``.

	    Raises:
	        HTTPException: 400 when no text could be extracted; 500 when the
	            service failed to initialize or processing raises.
	    """
	    if initialization_error:
	        raise HTTPException(status_code=500, detail=f"Service not properly initialized: {initialization_error}")

	    try:
	        file_content = await file.read()

	        extracted_text = document_processor.process_document(
	            file_data=file_content,
	            filename=file.filename,
	            use_ocr=use_ocr,
	        )

	        if not extracted_text:
	            raise HTTPException(
	                status_code=400,
	                detail="No text could be extracted from the document"
	            )

	        # These four targets are sent to the model under the shared code "dra"
	        # and selected with a ">>xxx<<" prefix on the input text.
	        if target_lang_code in ("tam", "tel", "kan", "mal"):
	            language_token = f">>{target_lang_code}<<"
	            modified_target_code = "dra"
	        else:
	            language_token = ""
	            modified_target_code = target_lang_code

	        if len(language_token) + len(extracted_text) > 1000:
	            # Chunk the raw text and prefix EVERY chunk with the token. Prefixing
	            # the whole text before chunking would leave all chunks after the
	            # first without a target-language selector.
	            chunks = text_chunker.create_chunks(extracted_text)
	            chunk_texts = [f"{language_token}{chunk.text}" for chunk in chunks]

	            translated_chunks = model.translate_batch(
	                chunk_texts,
	                source_lang_code,
	                modified_target_code,
	            )

	            translated_text = text_chunker.combine_translations(
	                extracted_text, chunks, translated_chunks
	            )
	        else:
	            translated_text = model.translate(
	                f"{language_token}{extracted_text}",
	                source_lang_code,
	                modified_target_code,
	            )

	        return {
	            "extracted_text": extracted_text,
	            "translated_text": translated_text
	        }
	    except HTTPException:
	        # Deliberate HTTP errors (the 400 above) must reach the client unchanged
	        # instead of being rewrapped as a 500 by the generic handler below.
	        raise
	    except Exception as e:
	        # Endpoint boundary: log with traceback and report as a 500.
	        logger.exception("Document processing error: %s", e)
	        raise HTTPException(status_code=500, detail=str(e)) from e

	if __name__ == "__main__":
	    # Direct execution: serve the app by import string on all interfaces,
	    # port 7860, with uvicorn's auto-reload switched on.
	    server_options = {"host": "0.0.0.0", "port": 7860, "reload": True}
	    uvicorn.run("api_server:app", **server_options)