Spaces:
Sleeping
Sleeping
Commit ·
b584aa5
1
Parent(s): 664bd5f
translation working pdf
Browse files- .gitignore +103 -0
- Dockerfile +1 -1
- app.py +125 -9
- document_processor.py +55 -0
- requirements.txt +6 -0
.gitignore
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
|
| 26 |
+
# Virtual environments
|
| 27 |
+
venv/
|
| 28 |
+
env/
|
| 29 |
+
ENV/
|
| 30 |
+
.venv/
|
| 31 |
+
.ENV/
|
| 32 |
+
pip-wheel-metadata/
|
| 33 |
+
|
| 34 |
+
# PyInstaller
|
| 35 |
+
*.manifest
|
| 36 |
+
*.spec
|
| 37 |
+
|
| 38 |
+
# Installer logs
|
| 39 |
+
pip-log.txt
|
| 40 |
+
pip-delete-this-directory.txt
|
| 41 |
+
|
| 42 |
+
# Unit test / coverage reports
|
| 43 |
+
htmlcov/
|
| 44 |
+
.tox/
|
| 45 |
+
.nox/
|
| 46 |
+
.coverage
|
| 47 |
+
.coverage.*
|
| 48 |
+
.cache
|
| 49 |
+
nosetests.xml
|
| 50 |
+
coverage.xml
|
| 51 |
+
*.cover
|
| 52 |
+
*.py,cover
|
| 53 |
+
.hypothesis/
|
| 54 |
+
|
| 55 |
+
# Pytest
|
| 56 |
+
.pytest_cache/
|
| 57 |
+
|
| 58 |
+
# mypy
|
| 59 |
+
.mypy_cache/
|
| 60 |
+
.dmypy.json
|
| 61 |
+
dmypy.json
|
| 62 |
+
|
| 63 |
+
# Pylint
|
| 64 |
+
pylint-report.txt
|
| 65 |
+
pylint-global.txt
|
| 66 |
+
|
| 67 |
+
# IDEs and editors
|
| 68 |
+
.vscode/
|
| 69 |
+
.idea/
|
| 70 |
+
*.sublime-project
|
| 71 |
+
*.sublime-workspace
|
| 72 |
+
|
| 73 |
+
# macOS
|
| 74 |
+
.DS_Store
|
| 75 |
+
|
| 76 |
+
# Linux
|
| 77 |
+
*~
|
| 78 |
+
|
| 79 |
+
# Logs and temp files
|
| 80 |
+
*.log
|
| 81 |
+
*.tmp
|
| 82 |
+
*.bak
|
| 83 |
+
|
| 84 |
+
# FastAPI-specific
|
| 85 |
+
instance/
|
| 86 |
+
*.db
|
| 87 |
+
*.sqlite3
|
| 88 |
+
|
| 89 |
+
# Static and media files (optional)
|
| 90 |
+
staticfiles/
|
| 91 |
+
media/
|
| 92 |
+
|
| 93 |
+
# dotenv
|
| 94 |
+
.env
|
| 95 |
+
.env.*
|
| 96 |
+
|
| 97 |
+
# Temp folders used in your app
|
| 98 |
+
tmp/
|
| 99 |
+
tmp/*
|
| 100 |
+
./tmp/
|
| 101 |
+
|
| 102 |
+
# Docker
|
| 103 |
+
docker-compose.override.yml
|
Dockerfile
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
| 2 |
# you will also find guides on how best to write your Dockerfile
|
| 3 |
|
| 4 |
-
FROM python:3.
|
| 5 |
|
| 6 |
RUN useradd -m -u 1000 user
|
| 7 |
USER user
|
|
|
|
| 1 |
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
| 2 |
# you will also find guides on how best to write your Dockerfile
|
| 3 |
|
| 4 |
+
FROM python:3.10-slim
|
| 5 |
|
| 6 |
RUN useradd -m -u 1000 user
|
| 7 |
USER user
|
app.py
CHANGED
|
@@ -1,17 +1,133 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from transformers import pipeline
|
| 3 |
|
| 4 |
app = FastAPI()
|
| 5 |
|
| 6 |
-
|
|
|
|
| 7 |
|
| 8 |
-
@app.get("/")
|
| 9 |
-
def
|
| 10 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
@app.
|
| 13 |
-
def
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
|
|
|
| 1 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
|
| 2 |
+
from fastapi.responses import HTMLResponse, FileResponse
|
| 3 |
+
from fastapi.staticfiles import StaticFiles
|
| 4 |
+
from typing import Optional
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from document_processor import process_document, supported_formats
|
| 9 |
from transformers import pipeline
|
| 10 |
|
| 11 |
app = FastAPI()
|
| 12 |
|
| 13 |
+
# Mount static files for CSS/JS only when the directory is present.
# StaticFiles validates the directory when it is constructed, so mounting a
# missing "static" folder would abort application start-up.
if os.path.isdir("static"):
    app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 15 |
|
| 16 |
+
@app.get("/", response_class=HTMLResponse)
async def upload_form():
    """Serve the landing page with the document upload form.

    The form posts a file and a ``target_language`` code to ``/translate``.
    The list of accepted extensions shown at the bottom of the page comes
    from ``supported_formats()``.
    """
    formats_label = ', '.join(supported_formats())
    # Literal CSS braces are doubled because this is an f-string.
    return f"""
<html>
<head>
<title>Document Translation Service</title>
<style>
body {{ font-family: Arial, sans-serif; max-width: 799px; margin: 0 auto; padding: 20px; }}
.container {{ border: 1px dashed #ccc; padding: 20px; text-align: center; border-radius: 5px; }}
.form-group {{ margin-bottom: 14px; }}
select, button {{ padding: 7px 15px; font-size: 16px; }}
.supported {{ margin-top: 19px; font-size: 14px; color: #666; }}
</style>
</head>
<body>
<h1>Document Translation Service</h1>
<div class="container">
<form action="/translate" method="post" enctype="multipart/form-data">
<div class="form-group">
<input type="file" name="file" required>
</div>
<div class="form-group">
<select name="target_language">
<option value="es">Spanish</option>
<option value="fr">French</option>
<option value="de">German</option>
<option value="it">Italian</option>
<option value="pt">Portuguese</option>
</select>
</div>
<button type="submit">Translate Document</button>
</form>
</div>
<div class="supported">
<p>Supported formats: {formats_label}</p>
</div>
</body>
</html>
"""
|
| 55 |
|
| 56 |
+
@app.post("/translate")
async def translate_file(
    file: UploadFile = File(...),
    target_language: str = Form(...)
):
    """Translate an uploaded document and return the result as a text file.

    Args:
        file: The uploaded document; its extension must be one of
            ``supported_formats()``.
        target_language: Target language code; must be a key of the model
            map below.

    Returns:
        A ``FileResponse`` serving ``./tmp/translated_<stem>.txt``.

    Raises:
        HTTPException: 400 for an unsupported file extension or target
            language; 500 for any other failure during extraction,
            translation or writing the output.
    """
    # Translation model map
    model_map = {
        "es": "Helsinki-NLP/opus-mt-en-es",
        "fr": "Helsinki-NLP/opus-mt-en-fr",
        "de": "Helsinki-NLP/opus-mt-en-de",
        "it": "Helsinki-NLP/opus-mt-en-it",
        "pt": "Helsinki-NLP/opus-mt-en-pt"
    }

    # Initialised up front so the cleanup in `finally` never has to probe
    # locals() to find out whether the temp file was created.
    temp_path = None

    try:
        # Validate file extension (a missing filename has no suffix and is
        # rejected the same way as an unsupported one).
        file_ext = Path(file.filename or "").suffix.lower()
        if file_ext not in supported_formats():
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file format. Supported formats: {', '.join(supported_formats())}"
            )

        # Validate the language before doing any file or model work.
        model_name = model_map.get(target_language)
        if not model_name:
            raise HTTPException(status_code=400, detail="Unsupported target language.")

        # Create temp file holding the upload so the extractors can read it
        # from a path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
            temp_path = temp_file.name
            temp_file.write(await file.read())

        # Extract text from document
        extracted_text = process_document(temp_path)

        # Load appropriate translation pipeline
        # NOTE(review): the model is loaded on every request and these calls
        # block inside an async handler — consider caching / a worker thread.
        print(f"[DEBUG] Loading translation model: {model_name}")
        pipe = pipeline("translation", model=model_name)

        # Translate text
        # NOTE(review): the whole document is passed as a single input with
        # max_length=1024; long documents may need chunking — confirm.
        translated_text = pipe(extracted_text, max_length=1024)[0]["translation_text"]

        # Save translation to text file
        temp_dir = Path("./tmp")
        temp_dir.mkdir(exist_ok=True)

        # Only the stem of the client-supplied name is used, so directory
        # components in the upload name never reach the output path.
        output_filename = f"translated_{Path(file.filename).stem}.txt"
        output_path = temp_dir / output_filename
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(translated_text)

        print(f"[DEBUG] Writing translation to: {output_path}")

        return FileResponse(
            output_path,
            filename=output_filename,
            media_type="text/plain"
        )

    except HTTPException:
        # Deliberate 4xx responses raised above must not be rewritten as 500.
        raise

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    finally:
        # The upload copy is only needed during extraction; always remove it.
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)
|
| 122 |
+
|
| 123 |
+
@app.get("/download/{filename}", response_class=FileResponse)
def download_file(filename: str):
    """Serve a previously generated file from the ``./tmp`` directory.

    Args:
        filename: Name of the file inside ``./tmp``, taken from the URL.

    Returns:
        A ``FileResponse`` for the requested file.

    Raises:
        HTTPException: 404 when the name does not resolve to a regular file
            located inside ``./tmp``.
    """
    base_dir = Path("./tmp").resolve()
    filepath = (base_dir / filename).resolve()
    # `filename` is client-controlled: reject anything that resolves outside
    # ./tmp (e.g. "..") and anything that is not a regular file.
    if base_dir not in filepath.parents or not filepath.is_file():
        raise HTTPException(status_code=404, detail="File not found.")
    return FileResponse(filepath, filename=filename)
|
| 129 |
+
|
| 130 |
+
@app.get("/health")
def health_check():
    """Health endpoint: always reports a fixed "healthy" status payload."""
    payload = {"status": "healthy"}
    return payload
|
| 133 |
|
document_processor.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import PyPDF2
|
| 3 |
+
from docx import Document
|
| 4 |
+
import pptx
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import tempfile
|
| 7 |
+
|
| 8 |
+
def supported_formats():
    """Return the accepted file extensions, lowercase with a leading dot."""
    extensions = ("pdf", "docx", "pptx", "txt", "xlsx")
    return [f".{name}" for name in extensions]
|
| 10 |
+
|
| 11 |
+
def _read_text_file(file_path: str) -> str:
    """Return the full contents of a UTF-8 encoded text file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()


def process_document(file_path: str) -> str:
    """Extract text from various document formats.

    The extractor is chosen from the file's extension (case-insensitive).

    Args:
        file_path: Path to a .pdf, .docx, .pptx, .txt or .xlsx file.

    Returns:
        The text extracted from the document.

    Raises:
        ValueError: If the extension is not supported, or if the selected
            extractor fails (the underlying error is attached as the cause).
    """
    file_ext = Path(file_path).suffix.lower()

    # Pick the extractor before entering the try block so the
    # "unsupported format" error is raised as-is instead of being re-wrapped
    # as a generic processing error.
    if file_ext == '.pdf':
        extractor = _extract_pdf_text
    elif file_ext == '.docx':
        extractor = _extract_docx_text
    elif file_ext == '.pptx':
        extractor = _extract_pptx_text
    elif file_ext == '.txt':
        extractor = _read_text_file
    elif file_ext == '.xlsx':
        extractor = _extract_excel_text
    else:
        raise ValueError(f"Unsupported file format: {file_ext}")

    try:
        return extractor(file_path)
    except Exception as e:
        # Keep a single exception type for callers, but chain the original
        # error so the real traceback is not lost.
        raise ValueError(f"Error processing document: {str(e)}") from e
|
| 31 |
+
|
| 32 |
+
def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page of a PDF, each followed by a newline.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The concatenated page texts; an empty string for a PDF with no pages.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # `or ""` keeps a page that yields no text from breaking the
        # concatenation; join avoids rebuilding the string once per page.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in reader.pages
        )
|
| 39 |
+
|
| 40 |
+
def _extract_docx_text(file_path: str) -> str:
    """Return the text of each paragraph in ``doc.paragraphs``, one per line."""
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
|
| 43 |
+
|
| 44 |
+
def _extract_pptx_text(file_path: str) -> str:
    """Return the text of every shape that exposes ``text``, one per line.

    Slides are visited in order, and shapes in order within each slide.
    """
    presentation = pptx.Presentation(file_path)
    return "\n".join(
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
|
| 52 |
+
|
| 53 |
+
def _extract_excel_text(file_path: str) -> str:
    """Load a workbook with pandas and return its ``to_string()`` rendering.

    No ``sheet_name`` is passed, so pandas' default sheet selection applies.
    """
    return pd.read_excel(file_path).to_string()
|
requirements.txt
CHANGED
|
@@ -35,3 +35,9 @@ typing-inspection==0.4.0
|
|
| 35 |
typing_extensions==4.13.1
|
| 36 |
urllib3==2.3.0
|
| 37 |
uvicorn==0.34.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
typing_extensions==4.13.1
|
| 36 |
urllib3==2.3.0
|
| 37 |
uvicorn==0.34.0
|
| 38 |
+
python-multipart==0.0.9
|
| 39 |
+
PyPDF2==3.0.1
|
| 40 |
+
python-docx==1.1.0
|
| 41 |
+
python-pptx==0.6.23
|
| 42 |
+
pandas==2.2.2
|
| 43 |
+
openpyxl==3.1.2
|