amirmadjour committed on
Commit
b584aa5
·
1 Parent(s): 664bd5f

translation working pdf

Browse files
Files changed (5) hide show
  1. .gitignore +103 -0
  2. Dockerfile +1 -1
  3. app.py +125 -9
  4. document_processor.py +55 -0
  5. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # Virtual environments
27
+ venv/
28
+ env/
29
+ ENV/
30
+ .venv/
31
+ .ENV/
32
+ pip-wheel-metadata/
33
+
34
+ # PyInstaller
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+
55
+ # Pytest
56
+ .pytest_cache/
57
+
58
+ # mypy
59
+ .mypy_cache/
60
+ .dmypy.json
61
+ dmypy.json
62
+
63
+ # Pylint
64
+ pylint-report.txt
65
+ pylint-global.txt
66
+
67
+ # IDEs and editors
68
+ .vscode/
69
+ .idea/
70
+ *.sublime-project
71
+ *.sublime-workspace
72
+
73
+ # macOS
74
+ .DS_Store
75
+
76
+ # Linux
77
+ *~
78
+
79
+ # Logs and temp files
80
+ *.log
81
+ *.tmp
82
+ *.bak
83
+
84
+ # FastAPI-specific
85
+ instance/
86
+ *.db
87
+ *.sqlite3
88
+
89
+ # Static and media files (optional)
90
+ staticfiles/
91
+ media/
92
+
93
+ # dotenv
94
+ .env
95
+ .env.*
96
+
97
+ # Temp folders used in your app
98
+ tmp/
99
+ tmp/*
100
+ ./tmp/
101
+
102
+ # Docker
103
+ docker-compose.override.yml
Dockerfile CHANGED
@@ -1,7 +1,7 @@
1
  # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
  # you will also find guides on how best to write your Dockerfile
3
 
4
- FROM python:3.12
5
 
6
  RUN useradd -m -u 1000 user
7
  USER user
 
1
  # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
  # you will also find guides on how best to write your Dockerfile
3
 
4
+ FROM python:3.10-slim
5
 
6
  RUN useradd -m -u 1000 user
7
  USER user
app.py CHANGED
@@ -1,17 +1,133 @@
1
- from fastapi import FastAPI
 
 
 
 
 
 
 
2
  from transformers import pipeline
3
 
4
  app = FastAPI()
5
 
6
- pipe = pipeline("text2text-generation", model="google/flan-t5-small")
 
7
 
8
- @app.get("/")
9
- def read_root():
10
- return {"message": "Hello World"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- @app.get("/generate")
13
- def generate(text: str):
14
- output = pipe(text)
 
 
 
 
 
 
 
 
 
 
15
 
16
- return {"message": output[0]["generated_text"]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
2
+ from fastapi.responses import HTMLResponse, FileResponse
3
+ from fastapi.staticfiles import StaticFiles
4
+ from typing import Optional
5
+ import os
6
+ import tempfile
7
+ from pathlib import Path
8
+ from document_processor import process_document, supported_formats
9
  from transformers import pipeline
10
 
11
  app = FastAPI()
12
 
13
+ # Mount static files for CSS/JS if needed
14
+ app.mount("/static", StaticFiles(directory="static"), name="static")
15
 
16
@app.get("/", response_class=HTMLResponse)
async def upload_form():
    """Serve the HTML page containing the document upload form.

    The form posts a file and a ``target_language`` code to ``/translate``
    as multipart form data. The list of accepted extensions shown at the
    bottom of the page comes from ``supported_formats()``.
    """
    # Render the extension list once so the template only interpolates a name.
    formats_label = ', '.join(supported_formats())
    page = f"""
    <html>
        <head>
            <title>Document Translation Service</title>
            <style>
                body {{ font-family: Arial, sans-serif; max-width: 799px; margin: 0 auto; padding: 20px; }}
                .container {{ border: 1px dashed #ccc; padding: 20px; text-align: center; border-radius: 5px; }}
                .form-group {{ margin-bottom: 14px; }}
                select, button {{ padding: 7px 15px; font-size: 16px; }}
                .supported {{ margin-top: 19px; font-size: 14px; color: #666; }}
            </style>
        </head>
        <body>
            <h1>Document Translation Service</h1>
            <div class="container">
                <form action="/translate" method="post" enctype="multipart/form-data">
                    <div class="form-group">
                        <input type="file" name="file" required>
                    </div>
                    <div class="form-group">
                        <select name="target_language">
                            <option value="es">Spanish</option>
                            <option value="fr">French</option>
                            <option value="de">German</option>
                            <option value="it">Italian</option>
                            <option value="pt">Portuguese</option>
                        </select>
                    </div>
                    <button type="submit">Translate Document</button>
                </form>
            </div>
            <div class="supported">
                <p>Supported formats: {formats_label}</p>
            </div>
        </body>
    </html>
    """
    return page
55
 
56
@app.post("/translate")
async def translate_file(
    file: UploadFile = File(...),
    target_language: str = Form(...)
):
    """Translate an uploaded document and return the result as a text file.

    The upload is copied to a temporary file, its text is extracted with
    ``process_document``, translated by the model mapped to
    ``target_language``, written to ``./tmp/translated_<stem>.txt`` and sent
    back as a ``text/plain`` download. The temporary copy of the upload is
    always removed before the handler returns.

    Args:
        file: The uploaded document. Its extension must be one of
            ``supported_formats()``.
        target_language: Language code selecting the translation model
            ("es", "fr", "de", "it" or "pt").

    Raises:
        HTTPException: 400 when the file extension or the target language is
            not supported; 500 for any other failure while extracting,
            translating or writing the result.
    """
    # Translation model map
    model_map = {
        "es": "Helsinki-NLP/opus-mt-en-es",
        "fr": "Helsinki-NLP/opus-mt-en-fr",
        "de": "Helsinki-NLP/opus-mt-en-de",
        "it": "Helsinki-NLP/opus-mt-en-it",
        "pt": "Helsinki-NLP/opus-mt-en-pt"
    }

    # Initialised up front so the cleanup in ``finally`` never has to
    # inspect ``locals()`` to learn whether the temp file was created.
    temp_path = None

    try:
        # Validate file extension (a missing filename counts as unsupported)
        file_ext = Path(file.filename or "").suffix.lower()
        if file_ext not in supported_formats():
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file format. Supported formats: {', '.join(supported_formats())}"
            )

        # Validate the language before doing any file or model work
        model_name = model_map.get(target_language)
        if not model_name:
            raise HTTPException(status_code=400, detail="Unsupported target language.")

        # Create temp file; it is closed (and flushed) before being parsed
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
            temp_path = temp_file.name
            content = await file.read()
            temp_file.write(content)

        # Extract text from document
        extracted_text = process_document(temp_path)

        # Load appropriate translation pipeline
        print(f"[DEBUG] Loading translation model: {model_name}")
        pipe = pipeline("translation", model=model_name)

        # Translate text
        translated_text = pipe(extracted_text, max_length=1024)[0]["translation_text"]

        # Save translation to text file
        temp_dir = Path("./tmp")
        temp_dir.mkdir(exist_ok=True)

        output_filename = f"translated_{Path(file.filename).stem}.txt"
        output_path = temp_dir / output_filename
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(translated_text)

        print(f"[DEBUG] Writing translation to: {output_path}")

        return FileResponse(
            output_path,
            filename=output_filename,
            media_type="text/plain"
        )

    except HTTPException:
        # Deliberate client errors (400) must reach the client unchanged
        # instead of being rewrapped as a 500 by the handler below.
        raise

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    finally:
        if temp_path is not None and os.path.exists(temp_path):
            os.unlink(temp_path)
122
+
123
@app.get("/download/{filename}", response_class=FileResponse)
def download_file(filename: str):
    """Return a previously generated file from the ``./tmp`` directory.

    Args:
        filename: Bare name of a file inside ``./tmp``.

    Raises:
        HTTPException: 404 when ``filename`` contains path components or
            does not name a regular file in ``./tmp``.
    """
    # Accept only a bare file name so the request cannot address anything
    # outside ./tmp through embedded directory components.
    if Path(filename).name != filename:
        raise HTTPException(status_code=404, detail="File not found.")

    filepath = Path("./tmp") / filename
    # is_file() rather than exists(): a directory (e.g. "..") exists but
    # cannot be served as a file response.
    if not filepath.is_file():
        raise HTTPException(status_code=404, detail="File not found.")
    return FileResponse(filepath, filename=filename)
129
+
130
@app.get("/health")
def health_check():
    """Report that the service is up."""
    payload = dict(status="healthy")
    return payload
133
 
document_processor.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import PyPDF2
3
+ from docx import Document
4
+ import pptx
5
+ import pandas as pd
6
+ import tempfile
7
+
8
def supported_formats():
    """Return the file extensions (with leading dot) that can be processed."""
    extensions = ".pdf .docx .pptx .txt .xlsx"
    # A fresh list is built on every call, so callers may mutate the result.
    return extensions.split()
10
+
11
def process_document(file_path: str) -> str:
    """Extract text from various document formats.

    The format is chosen from the (case-insensitive) extension of
    ``file_path`` and the matching extractor is called.

    Args:
        file_path: Path to a ``.pdf``, ``.docx``, ``.pptx``, ``.txt`` or
            ``.xlsx`` file.

    Returns:
        The text extracted from the document.

    Raises:
        ValueError: If the extension is not supported or the extractor
            fails; the original exception is attached as ``__cause__``.
    """
    file_ext = Path(file_path).suffix.lower()

    try:
        if file_ext == '.pdf':
            return _extract_pdf_text(file_path)
        elif file_ext == '.docx':
            return _extract_docx_text(file_path)
        elif file_ext == '.pptx':
            return _extract_pptx_text(file_path)
        elif file_ext == '.txt':
            # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM,
            # which would otherwise end up in the extracted text.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                return f.read()
        elif file_ext == '.xlsx':
            return _extract_excel_text(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_ext}")
    except Exception as e:
        # Keep the single ValueError contract for callers while preserving
        # the underlying traceback for debugging.
        raise ValueError(f"Error processing document: {str(e)}") from e
31
+
32
def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The concatenated page texts, each followed by ``"\\n"``.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Join once instead of growing a string with += per page; a page
        # that yields no text (None) contributes an empty line rather than
        # raising TypeError.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
39
+
40
def _extract_docx_text(file_path: str) -> str:
    """Return the paragraphs of a .docx file joined by newlines."""
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
43
+
44
def _extract_pptx_text(file_path: str) -> str:
    """Return the text of every shape in a .pptx file, joined by newlines.

    Slides are visited in order; shapes without a ``text`` attribute are
    skipped.
    """
    presentation = pptx.Presentation(file_path)
    shape_texts = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "\n".join(shape_texts)
52
+
53
def _extract_excel_text(file_path: str) -> str:
    """Return the workbook loaded by ``pd.read_excel`` rendered as a string."""
    # read_excel is called with its defaults, exactly as a bare call would be.
    return pd.read_excel(file_path).to_string()
requirements.txt CHANGED
@@ -35,3 +35,9 @@ typing-inspection==0.4.0
35
  typing_extensions==4.13.1
36
  urllib3==2.3.0
37
  uvicorn==0.34.0
 
 
 
 
 
 
 
35
  typing_extensions==4.13.1
36
  urllib3==2.3.0
37
  uvicorn==0.34.0
38
+ python-multipart==0.0.9
39
+ PyPDF2==3.0.1
40
+ python-docx==1.1.0
41
+ python-pptx==0.6.23
42
+ pandas==2.2.2
43
+ openpyxl==3.1.2