Spaces:

amirmadjour
/

amir

Sleeping

App Files Files Community

amirmadjour committed on Apr 30, 2025

Commit

b584aa5

1 Parent(s): 664bd5f

translation working pdf

Browse files

Files changed (5) hide show

.gitignore +103 -0
Dockerfile +1 -1
app.py +125 -9
document_processor.py +55 -0
requirements.txt +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,103 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual environments
+venv/
+env/
+ENV/
+.venv/
+.ENV/
+pip-wheel-metadata/
+# PyInstaller
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+# Pytest
+.pytest_cache/
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pylint
+pylint-report.txt
+pylint-global.txt
+# IDEs and editors
+.vscode/
+.idea/
+*.sublime-project
+*.sublime-workspace
+# macOS
+.DS_Store
+# Linux
+*~
+# Logs and temp files
+*.log
+*.tmp
+*.bak
+# FastAPI-specific
+instance/
+*.db
+*.sqlite3
+# Static and media files (optional)
+staticfiles/
+media/
+# dotenv
+.env
+.env.*
+# Temp folders used in your app
+tmp/
+tmp/*
+./tmp/
+# Docker
+docker-compose.override.yml

Dockerfile CHANGED Viewed

@@ -1,7 +1,7 @@
 # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
 # you will also find guides on how best to write your Dockerfile
-FROM python:3.12
 RUN useradd -m -u 1000 user
 USER user

 # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
 # you will also find guides on how best to write your Dockerfile
+FROM python:3.10-slim
 RUN useradd -m -u 1000 user
 USER user

app.py CHANGED Viewed

@@ -1,17 +1,133 @@
-from fastapi import FastAPI
 from transformers import pipeline
 app = FastAPI()
-pipe = pipeline("text2text-generation", model="google/flan-t5-small")
-@app.get("/")
-def read_root():
-    return {"message": "Hello World"}
-@app.get("/generate")
-def generate(text: str):
-  output = pipe(text)
-  return {"message": output[0]["generated_text"]}

+from fastapi import FastAPI, UploadFile, File, HTTPException, Form
+from fastapi.responses import HTMLResponse, FileResponse
+from fastapi.staticfiles import StaticFiles
+from typing import Optional
+import os
+import tempfile
+from pathlib import Path
+from document_processor import process_document, supported_formats
 from transformers import pipeline
 app = FastAPI()
+# Mount static files for CSS/JS if needed
+# NOTE(review): StaticFiles presumably validates that the "static" directory
+# exists when it is constructed, which would make the app fail at import time
+# if the directory is missing — confirm a static/ directory ships with the app.
+app.mount("/static", StaticFiles(directory="static"), name="static")
+@app.get("/", response_class=HTMLResponse)
+async def upload_form():
+    """Serve the HTML upload form for the translation service.
+
+    The form posts a file and a target language code to ``/translate`` and
+    lists the extensions returned by ``supported_formats()``.
+    """
+    # Build the human-readable extension list once, outside the template.
+    formats_label = ', '.join(supported_formats())
+    return f"""
+    <html>
+        <head>
+            <title>Document Translation Service</title>
+            <style>
+                body {{ font-family: Arial, sans-serif; max-width: 799px; margin: 0 auto; padding: 20px; }}
+                .container {{ border: 1px dashed #ccc; padding: 20px; text-align: center; border-radius: 5px; }}
+                .form-group {{ margin-bottom: 14px; }}
+                select, button {{ padding: 7px 15px; font-size: 16px; }}
+                .supported {{ margin-top: 19px; font-size: 14px; color: #666; }}
+            </style>
+        </head>
+        <body>
+            <h1>Document Translation Service</h1>
+            <div class="container">
+                <form action="/translate" method="post" enctype="multipart/form-data">
+                    <div class="form-group">
+                        <input type="file" name="file" required>
+                    </div>
+                    <div class="form-group">
+                        <select name="target_language">
+                            <option value="es">Spanish</option>
+                            <option value="fr">French</option>
+                            <option value="de">German</option>
+                            <option value="it">Italian</option>
+                            <option value="pt">Portuguese</option>
+                        </select>
+                    </div>
+                    <button type="submit">Translate Document</button>
+                </form>
+            </div>
+            <div class="supported">
+                <p>Supported formats: {formats_label}</p>
+            </div>
+        </body>
+    </html>
+    """
+@app.post("/translate")
+async def translate_file(
+    file: UploadFile = File(...),
+    target_language: str = Form(...)
+):
+    """Translate an uploaded document and return the result as a text file.
+
+    Args:
+        file: Uploaded document; its extension must be one of
+            ``supported_formats()``.
+        target_language: Target language code; must be a key of the model
+            map below (es, fr, de, it, pt).
+
+    Returns:
+        A ``FileResponse`` serving ``./tmp/translated_<stem>.txt`` as
+        ``text/plain``.
+
+    Raises:
+        HTTPException: 400 for an unsupported file format or target language,
+            500 for any other failure while extracting or translating.
+    """
+    # Translation model map
+    model_map = {
+        "es": "Helsinki-NLP/opus-mt-en-es",
+        "fr": "Helsinki-NLP/opus-mt-en-fr",
+        "de": "Helsinki-NLP/opus-mt-en-de",
+        "it": "Helsinki-NLP/opus-mt-en-it",
+        "pt": "Helsinki-NLP/opus-mt-en-pt"
+    }
+    # Initialised before the try block so the cleanup in ``finally`` can test
+    # it directly instead of inspecting locals().
+    temp_path = None
+    try:
+        # Validate file extension; a missing filename yields an empty suffix
+        # and is rejected as unsupported instead of crashing in Path().
+        upload_name = file.filename or ""
+        file_ext = Path(upload_name).suffix.lower()
+        if file_ext not in supported_formats():
+            raise HTTPException(
+                status_code=400,
+                detail=f"Unsupported file format. Supported formats: {', '.join(supported_formats())}"
+            )
+        # Validate the target language before doing any file or model work.
+        model_name = model_map.get(target_language)
+        if not model_name:
+            raise HTTPException(status_code=400, detail="Unsupported target language.")
+        # Create temp file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
+            temp_path = temp_file.name
+            content = await file.read()
+            temp_file.write(content)
+        # Extract text from document
+        extracted_text = process_document(temp_path)
+        # Load appropriate translation pipeline
+        print(f"[DEBUG] Loading translation model: {model_name}")
+        pipe = pipeline("translation", model=model_name)
+        # Translate text
+        # NOTE(review): the whole document is passed in a single call with
+        # max_length=1024 — presumably long documents get truncated; confirm.
+        translated_text = pipe(extracted_text, max_length=1024)[0]["translation_text"]
+        # Save translation to text file
+        temp_dir = Path("./tmp")
+        temp_dir.mkdir(exist_ok=True)
+        output_filename = f"translated_{Path(upload_name).stem}.txt"
+        output_path = temp_dir / output_filename
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(translated_text)
+        print(f"[DEBUG] Writing translation to: {output_path}")
+        return FileResponse(
+            output_path,
+            filename=output_filename,
+            media_type="text/plain"
+        )
+    except HTTPException:
+        # The 400 responses raised above must reach the client unchanged
+        # rather than being rewrapped as a 500 by the generic handler below.
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        # The upload copy is only needed for extraction; always remove it.
+        if temp_path is not None and os.path.exists(temp_path):
+            os.unlink(temp_path)
+@app.get("/download/{filename}", response_class=FileResponse)
+def download_file(filename: str):
+    """Serve a previously generated file from the ``./tmp`` directory.
+
+    Args:
+        filename: Name of a file located directly or indirectly under ``./tmp``.
+
+    Returns:
+        A ``FileResponse`` for the requested file.
+
+    Raises:
+        HTTPException: 404 if the resolved path is outside ``./tmp`` or is
+            not a regular file.
+    """
+    base_dir = Path("./tmp").resolve()
+    filepath = (base_dir / filename).resolve()
+    # Resolve first so names such as ".." cannot escape ./tmp, and require a
+    # regular file so a directory is never handed to FileResponse.
+    if base_dir not in filepath.parents or not filepath.is_file():
+        raise HTTPException(status_code=404, detail="File not found.")
+    return FileResponse(filepath, filename=filename)
+@app.get("/health")
+def health_check():
+    """Return a fixed payload reporting the service as healthy."""
+    status_payload = {"status": "healthy"}
+    return status_payload

document_processor.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from pathlib import Path
+import PyPDF2
+from docx import Document
+import pptx
+import pandas as pd
+import tempfile
+def supported_formats():
+    """Return the lower-case file extensions accepted for processing."""
+    extensions = ('.pdf', '.docx', '.pptx', '.txt', '.xlsx')
+    # Hand back a fresh list on every call, as callers receive a list.
+    return list(extensions)
+def process_document(file_path: str) -> str:
+    """Extract text from various document formats.
+
+    The extractor is chosen from the lower-cased suffix of ``file_path``.
+
+    Args:
+        file_path: Path to a .pdf, .docx, .pptx, .txt or .xlsx file.
+
+    Returns:
+        The extracted text.
+
+    Raises:
+        ValueError: If the suffix is not handled or extraction fails. Every
+            failure, including the unsupported-format case, is reported as
+            "Error processing document: ..." with the original exception
+            chained as the cause.
+    """
+    file_ext = Path(file_path).suffix.lower()
+    # Suffix -> extractor for the formats that need a dedicated parser.
+    extractors = {
+        '.pdf': _extract_pdf_text,
+        '.docx': _extract_docx_text,
+        '.pptx': _extract_pptx_text,
+        '.xlsx': _extract_excel_text,
+    }
+    try:
+        if file_ext == '.txt':
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return f.read()
+        extractor = extractors.get(file_ext)
+        if extractor is None:
+            raise ValueError(f"Unsupported file format: {file_ext}")
+        return extractor(file_path)
+    except Exception as e:
+        # Chain explicitly so the underlying parser error stays attached.
+        raise ValueError(f"Error processing document: {str(e)}") from e
+def _extract_pdf_text(file_path: str) -> str:
+    """Return the text of every page of a PDF, each followed by a newline.
+
+    Args:
+        file_path: Path to the PDF file; it is opened in binary mode.
+
+    Returns:
+        The concatenated page texts; an empty string for a PDF with no pages.
+    """
+    with open(file_path, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+        # Join once instead of growing a string page by page. The ``or ""``
+        # guards against extract_text() returning None, which some PyPDF2
+        # versions presumably do for pages without a text layer — confirm.
+        return "".join(
+            (page.extract_text() or "") + "\n" for page in reader.pages
+        )
+def _extract_docx_text(file_path: str) -> str:
+    """Return the text of each paragraph in a .docx file, one per line."""
+    paragraphs = Document(file_path).paragraphs
+    return "\n".join(para.text for para in paragraphs)
+def _extract_pptx_text(file_path: str) -> str:
+    """Return the text of every text-bearing shape in a .pptx, one per line.
+
+    Shapes are visited slide by slide; shapes without a ``text`` attribute
+    are skipped.
+    """
+    presentation = pptx.Presentation(file_path)
+    shape_texts = [
+        shape.text
+        for slide in presentation.slides
+        for shape in slide.shapes
+        if hasattr(shape, "text")
+    ]
+    return "\n".join(shape_texts)
+def _extract_excel_text(file_path: str) -> str:
+    """Return the string rendering of an Excel workbook read with pandas."""
+    # NOTE(review): read_excel is called without sheet_name, so presumably
+    # only the default sheet is rendered — confirm that is intended.
+    return pd.read_excel(file_path).to_string()

requirements.txt CHANGED Viewed

@@ -35,3 +35,9 @@ typing-inspection==0.4.0
 typing_extensions==4.13.1
 urllib3==2.3.0
 uvicorn==0.34.0

 typing_extensions==4.13.1
 urllib3==2.3.0
 uvicorn==0.34.0
+python-multipart==0.0.9
+PyPDF2==3.0.1
+python-docx==1.1.0
+python-pptx==0.6.23
+pandas==2.2.2
+openpyxl==3.1.2