mistpe committed on
Commit
50fc7b7
·
verified ·
1 Parent(s): 2920df4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +570 -0
app.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form, Body
3
+ from fastapi.staticfiles import StaticFiles
4
+ from fastapi.responses import StreamingResponse, FileResponse
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ import uvicorn
7
+ import asyncio
8
+ import aiohttp
9
+ import json
10
+ import webbrowser
11
+ from typing import List, Dict, Optional
12
+ import os
13
+ from pathlib import Path
14
+ import pypdf
15
+ from docx import Document
16
+ from docx.shared import Inches, Pt
17
+ import markdown
18
+ import base64
19
+ from pydantic import BaseModel
20
+ import threading
21
+ from typing import List, Optional
22
+ import time
23
+ import hashlib
24
+ import re
25
+ from io import BytesIO
26
+
27
# Create the directories the application relies on (static assets,
# temporary files, and the translation-memory store).
for _required_dir in ("static", "temp", "translation_memory"):
    os.makedirs(_required_dir, exist_ok=True)
31
+
32
app = FastAPI()

# Configure CORS.
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination — confirm this is intended before exposing the
# service beyond a local machine.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount the static files directory under /static.
app.mount("/static", StaticFiles(directory="static"), name="static")
45
+
46
class DocumentSegment(BaseModel):
    """One translatable unit of a document together with its translation state."""

    # Source text of the segment.
    text: str
    # Segment kind; the extraction code in this file emits 'paragraph',
    # 'heading' and 'markdown'.
    type: str
    # Formatting metadata captured at extraction time (keys vary by file type).
    format: Dict
    # Location of the segment in the source (e.g. 'index', 'page').
    position: Dict
    # Translated text; empty until the segment has been translated.
    translated: str = ""
    # Alternative translations reported by the translation backend.
    alternatives: List[str] = []
    terminology: Dict[str, str] = {}
    confidence: float = 0.0
    # 'pending' initially; the translation manager in this file sets
    # 'from_memory' or 'machine_translated'.
    review_status: str = "pending"
56
+
57
class TranslationRequest(BaseModel):
    """Request body for translating a single piece of text."""

    text: str
    source_lang: str
    target_lang: str
    # NOTE(review): the two flags below are accepted but not read by any code
    # visible in this file — confirm whether they should be honoured.
    use_memory: bool = True
    use_terminology: bool = True
63
+
64
class ExportRequest(BaseModel):
    """Request body for exporting translated segments as a document."""

    segments: List[DocumentSegment]
    # Output format: 'auto', 'txt', 'docx', 'md' or 'html' are handled by
    # DocumentExporter.export.
    format: str
    # 'translated' exports only the translation; any other value produces a
    # side-by-side (source + translation) document.
    mode: str
    # Extension of the originally uploaded file; used when format is 'auto'.
    source_file_type: str
69
+
70
class TranslationMemory:
    """JSON-file-backed store of previously produced translations.

    Entries are keyed by ``"{source_lang}_{target_lang}_{md5(text)}"`` and
    the whole store is written back to disk after every addition.
    """

    def __init__(self):
        self.memory_file = "translation_memory/memory.json"
        self.load_memory()

    @staticmethod
    def _make_key(text: str, source_lang: str, target_lang: str) -> str:
        """Build the lookup key for a text and language pair."""
        # MD5 is used only as a compact fingerprint of the text, not for security.
        return f"{source_lang}_{target_lang}_{hashlib.md5(text.encode()).hexdigest()}"

    def load_memory(self):
        """Load the store from disk into ``self.memory``.

        A missing file is created empty. A file that is not valid JSON, or
        whose top-level value is not an object, is treated as an empty store
        instead of crashing application start-up.
        """
        if not os.path.exists(self.memory_file):
            self.memory = {}
            self.save_memory()
            return

        try:
            with open(self.memory_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError: the file is truncated or otherwise corrupt.
            loaded = None
        self.memory = loaded if isinstance(loaded, dict) else {}

    def save_memory(self):
        """Write the store to disk, replacing the previous file atomically."""
        directory = os.path.dirname(self.memory_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # Write to a sibling file first so an interrupted write can never
        # leave a half-written memory file behind.
        tmp_path = f"{self.memory_file}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(self.memory, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, self.memory_file)

    def get_translation(self, text: str, source_lang: str, target_lang: str) -> Optional[str]:
        """Return the stored translation for *text*, or ``None`` if absent."""
        key = self._make_key(text, source_lang, target_lang)
        return self.memory.get(key, {}).get('translation')

    def add_translation(self, text: str, translation: str, source_lang: str, target_lang: str):
        """Record a translation (with a timestamp) and persist the store."""
        key = self._make_key(text, source_lang, target_lang)
        self.memory[key] = {
            'text': text,
            'translation': translation,
            'timestamp': time.time(),
        }
        self.save_memory()
99
+
100
class TerminologyManager:
    """JSON-file-backed glossary of term translations per language pair.

    Terms are grouped under ``"{source_lang}_{target_lang}"`` keys and the
    whole glossary is written back to disk after every addition.
    """

    def __init__(self):
        self.terminology_file = "translation_memory/terminology.json"
        self.load_terminology()

    def load_terminology(self):
        """Load the glossary from disk into ``self.terminology``.

        A missing file is created empty. A file that is not valid JSON, or
        whose top-level value is not an object, is treated as an empty
        glossary instead of crashing application start-up.
        """
        if not os.path.exists(self.terminology_file):
            self.terminology = {}
            self.save_terminology()
            return

        try:
            with open(self.terminology_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError: the file is truncated or otherwise corrupt.
            loaded = None
        self.terminology = loaded if isinstance(loaded, dict) else {}

    def save_terminology(self):
        """Write the glossary to disk, replacing the previous file atomically."""
        directory = os.path.dirname(self.terminology_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # Write to a sibling file first so an interrupted write can never
        # leave a half-written glossary behind.
        tmp_path = f"{self.terminology_file}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(self.terminology, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, self.terminology_file)

    def get_terminology(self, source_lang: str, target_lang: str) -> Dict[str, str]:
        """Return the source-term to target-term mapping for a language pair."""
        key = f"{source_lang}_{target_lang}"
        return self.terminology.get(key, {})

    def add_term(self, source_term: str, target_term: str, source_lang: str, target_lang: str):
        """Add or overwrite one term for a language pair and persist the glossary."""
        key = f"{source_lang}_{target_lang}"
        self.terminology.setdefault(key, {})[source_term] = target_term
        self.save_terminology()
127
+
128
class DocumentProcessor:
    """Split uploaded documents (txt, pdf, docx, md) into translatable segments."""

    @staticmethod
    async def extract_text(file: UploadFile) -> List[DocumentSegment]:
        """Read an upload and return its non-empty segments.

        The parser is chosen from the file name's extension. Files with any
        other extension yield an empty list. PDF and DOCX uploads are parsed
        from memory, so the client-supplied file name is never used as a
        filesystem path.
        """
        content = await file.read()
        # The file name may be missing on an upload; treat that as "no extension".
        file_ext = (file.filename or '').split('.')[-1].lower()

        if file_ext == 'txt':
            return DocumentProcessor._segments_from_txt(content)
        if file_ext == 'pdf':
            return DocumentProcessor._segments_from_pdf(content)
        if file_ext == 'docx':
            return DocumentProcessor._segments_from_docx(content)
        if file_ext == 'md':
            return DocumentProcessor._segments_from_markdown(content)
        return []

    @staticmethod
    def _segments_from_txt(content: bytes) -> List[DocumentSegment]:
        """Split plain text into paragraphs separated by blank lines."""
        # 'utf-8-sig' also accepts plain UTF-8 but drops a leading BOM so it
        # does not end up inside the first segment.
        text = content.decode('utf-8-sig')
        paragraphs = re.split(r'\n\s*\n|\r\n\s*\r\n', text)
        return [
            DocumentSegment(
                text=para.strip(),
                type='paragraph',
                format={'font': 'default', 'style': 'normal'},
                position={'index': i},
            )
            for i, para in enumerate(paragraphs)
            if para.strip()
        ]

    @staticmethod
    def _segments_from_pdf(content: bytes) -> List[DocumentSegment]:
        """Extract paragraphs page by page from a PDF held in memory."""
        segments = []
        reader = pypdf.PdfReader(BytesIO(content))
        for page_index, page in enumerate(reader.pages):
            # Guard against pages for which no text could be extracted.
            text = page.extract_text() or ''
            for para_index, para in enumerate(text.split('\n\n')):
                if para.strip():
                    segments.append(DocumentSegment(
                        text=para.strip(),
                        type='paragraph',
                        # Font information is not extracted; fixed defaults are reported.
                        format={
                            'font': 'default',
                            'size': 12,
                            'page': page_index + 1,
                        },
                        position={'page': page_index, 'index': para_index},
                    ))
        return segments

    @staticmethod
    def _segments_from_docx(content: bytes) -> List[DocumentSegment]:
        """Extract non-empty paragraphs and their style info from a DOCX held in memory."""
        segments = []
        doc = Document(BytesIO(content))
        for i, para in enumerate(doc.paragraphs):
            if not para.text.strip():
                continue
            format_info = {
                'style': para.style.name,
                'alignment': str(para.alignment),
                'font': para.style.font.name if para.style.font else 'default',
                'size': para.style.font.size if para.style.font else 12,
                'bold': any(run.bold for run in para.runs),
                'italic': any(run.italic for run in para.runs),
            }
            segments.append(DocumentSegment(
                text=para.text.strip(),
                type='paragraph' if not para.style.name.startswith('Heading') else 'heading',
                format=format_info,
                position={'index': i},
            ))
        return segments

    @staticmethod
    def _markdown_segment(lines: List[str], index: int) -> DocumentSegment:
        """Build one markdown segment from a run of consecutive non-blank lines."""
        segment_text = '\n'.join(lines)
        format_info = {
            'type': 'markdown',
            'headings': bool(re.match(r'^#+\s', segment_text)),
            'lists': bool(re.match(r'^[-*+]\s', segment_text)),
            'code': bool(re.match(r'^```', segment_text)),
        }
        return DocumentSegment(
            text=segment_text,
            type='markdown',
            format=format_info,
            position={'index': index},
        )

    @staticmethod
    def _segments_from_markdown(content: bytes) -> List[DocumentSegment]:
        """Group markdown lines into segments separated by blank lines."""
        text = content.decode('utf-8-sig')
        segments = []
        current_lines = []
        for line in text.split('\n'):
            if line.strip():
                current_lines.append(line)
            elif current_lines:
                segments.append(DocumentProcessor._markdown_segment(current_lines, len(segments)))
                current_lines = []

        # A file that does not end with a blank line still has a pending
        # segment; it gets the same format flags as every other segment.
        if current_lines:
            segments.append(DocumentProcessor._markdown_segment(current_lines, len(segments)))
        return segments
232
+
233
class DocumentExporter:
    """Render translated segments as txt, docx, markdown or HTML.

    ``segments`` is a list of dicts with at least the keys ``text``,
    ``translated`` and ``format``. Every ``export_*`` method accepts a
    ``mode``: ``'translated'`` outputs only the translation (falling back to
    the source text when a segment has none); any other value produces a
    side-by-side source/translation document.
    """

    def __init__(self, segments, source_file_type):
        self.segments = segments
        self.source_file_type = source_file_type

    def export_txt(self, mode='translated'):
        """Return the document as UTF-8 encoded plain text."""
        if mode == 'translated':
            content = '\n\n'.join(seg['translated'] or seg['text'] for seg in self.segments)
        else:  # side-by-side mode
            content = ''.join(
                f"原文：{seg['text']}\n"
                f"译文：{seg['translated']}\n"
                f"{'=' * 50}\n\n"
                for seg in self.segments
            )
        return content.encode('utf-8')

    @staticmethod
    def _apply_style(paragraph, style_name):
        """Best-effort: apply a named style to a python-docx paragraph."""
        if not style_name:
            return
        try:
            paragraph.style = style_name
        except Exception:
            # The style name comes from the source document and may not exist
            # in the blank template; the paragraph then keeps its default style.
            pass

    def export_docx(self, mode='translated'):
        """Return the document as the bytes of a .docx file."""
        doc = Document()
        section = doc.sections[0]
        # 11.69in x 8.27in are the A4 dimensions with the long edge as the
        # width, i.e. landscape proportions.
        section.page_width = Inches(11.69)
        section.page_height = Inches(8.27)

        if mode == 'translated':
            for seg in self.segments:
                p = doc.add_paragraph()
                self._apply_style(p, seg['format'].get('style'))
                p.add_run(seg['translated'] or seg['text'])
        else:  # side-by-side mode: two-column table
            table = doc.add_table(rows=1, cols=2)
            table.style = 'Table Grid'
            header_cells = table.rows[0].cells
            header_cells[0].text = '原文'
            header_cells[1].text = '译文'

            for seg in self.segments:
                row_cells = table.add_row().cells
                row_cells[0].text = seg['text']
                row_cells[1].text = seg['translated'] or ''
                for cell in row_cells:
                    self._apply_style(cell.paragraphs[0], seg['format'].get('style'))

        # Serialise through an in-memory buffer rather than a file on disk.
        buffer = BytesIO()
        doc.save(buffer)
        return buffer.getvalue()

    def export_markdown(self, mode='translated'):
        """Return the document as UTF-8 encoded markdown."""
        if mode == 'translated':
            content = [seg['translated'] or seg['text'] for seg in self.segments]
            return '\n\n'.join(content).encode('utf-8')

        content = []
        for seg in self.segments:
            content.extend([
                '### 原文\n',
                seg['text'],
                '\n### 译文\n',
                seg['translated'] or '',
                '\n---\n',
            ])
        return '\n'.join(content).encode('utf-8')

    def export_html(self, mode='translated'):
        """Return the document as a UTF-8 encoded standalone HTML page.

        Segment text is HTML-escaped so characters such as ``<`` and ``&`` in
        the source or the translation are shown literally and cannot inject
        markup into the page.
        """
        import html  # local import: only this method needs it

        css = """
        <style>
            .translation-wrapper { max-width: 1200px; margin: 0 auto; padding: 20px; }
            .segment { margin-bottom: 20px; }
            .parallel { display: flex; gap: 20px; }
            .source, .target { flex: 1; padding: 10px; background: #f9f9f9; border-radius: 4px; }
            h3 { color: #666; font-size: 0.9em; margin-bottom: 5px; }
        </style>
        """

        content = [
            '<!DOCTYPE html><html><head><meta charset="UTF-8">',
            css,
            '</head><body><div class="translation-wrapper">',
        ]

        for seg in self.segments:
            if mode == 'translated':
                body = html.escape(seg['translated'] or seg['text'])
                content.append(f'<div class="segment">{body}</div>')
            else:  # side-by-side mode
                source = html.escape(seg['text'])
                target = html.escape(seg['translated'] or '')
                content.append('<div class="segment parallel">')
                content.append(f'<div class="source"><h3>原文</h3>{source}</div>')
                content.append(f'<div class="target"><h3>译文</h3>{target}</div>')
                content.append('</div>')

        content.append('</div></body></html>')
        return '\n'.join(content).encode('utf-8')

    def export(self, format='auto', mode='translated'):
        """Export in the requested format.

        Args:
            format: 'txt', 'docx', 'md', 'html', or 'auto' to reuse the
                source file type (falling back to 'txt' when it is empty).
            mode: passed through to the format-specific exporter.

        Returns:
            A dict with keys 'content' (bytes), 'mimetype' and 'extension'.

        Raises:
            ValueError: if the resolved format is not supported.
        """
        if format == 'auto':
            format = self.source_file_type or 'txt'

        exporters = {
            'txt': (self.export_txt, 'text/plain'),
            'docx': (
                self.export_docx,
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            ),
            'md': (self.export_markdown, 'text/markdown'),
            'html': (self.export_html, 'text/html'),
        }
        entry = exporters.get(format)
        if entry is None:
            raise ValueError(f'Unsupported format: {format}')

        render, mimetype = entry
        return {
            'content': render(mode),
            'mimetype': mimetype,
            'extension': format,
        }
380
+
381
class TranslationManager:
    """Translate text through an HTTP translation API, with memory and glossary support."""

    def __init__(self):
        self.memory = TranslationMemory()
        self.terminology = TerminologyManager()

    async def translate_text(self, text: str, source_lang: str = "AUTO", target_lang: str = "ZH") -> Dict:
        """Translate one piece of text.

        The translation memory is consulted first. Otherwise glossary terms
        found in the text are swapped for placeholders, the text is posted to
        the URL in the ``DEEPL_API_URL`` environment variable, the
        placeholders are replaced with the glossary translations, and the
        result is stored in the translation memory.

        Returns:
            A dict with 'translated', 'alternatives' and 'from_memory';
            API results additionally carry 'confidence'.

        Raises:
            HTTPException: status 500 when the API URL is not configured, the
                request fails, or the API does not report code 200.
        """
        # Check the translation memory first.
        memory_translation = self.memory.get_translation(text, source_lang, target_lang)
        if memory_translation:
            return {
                'translated': memory_translation,
                'alternatives': [],
                'from_memory': True,
            }

        # Protect glossary terms with placeholders. Longer terms go first so a
        # short term cannot break up a longer term that contains it; empty
        # terms are skipped because replacing "" would insert the placeholder
        # between every character.
        terms = self.terminology.get_terminology(source_lang, target_lang)
        text_to_translate = text
        replacements = {}
        for source_term, target_term in sorted(terms.items(), key=lambda item: len(item[0]), reverse=True):
            if source_term and source_term in text_to_translate:
                placeholder = f"__TERM_{len(replacements)}__"
                replacements[placeholder] = target_term
                text_to_translate = text_to_translate.replace(source_term, placeholder)

        # The translation API URL comes from the environment; fail before
        # opening any connection when it is missing.
        deepl_api_url = os.environ.get('DEEPL_API_URL')
        if not deepl_api_url:
            raise HTTPException(
                status_code=500,
                detail="DEEPL_API_URL environment variable is not set."
            )

        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(
                    deepl_api_url,
                    json={
                        "text": text_to_translate,
                        "source_lang": source_lang,
                        "target_lang": target_lang
                    }
                ) as response:
                    result = await response.json()

            if result.get('code') != 200:
                raise HTTPException(status_code=500, detail="Translation API error")

            translated_text = result['data']
            # Put the glossary translations back in place of the placeholders.
            for placeholder, term in replacements.items():
                translated_text = translated_text.replace(placeholder, term)

            # Remember the result for next time.
            self.memory.add_translation(text, translated_text, source_lang, target_lang)

            return {
                'translated': translated_text,
                'alternatives': result.get('alternatives', []),
                'from_memory': False,
                'confidence': 0.8 if replacements else 0.7
            }
        except HTTPException:
            # Already a well-formed HTTP error; do not wrap it a second time.
            raise
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

    async def translate_segments(self, segments: List[DocumentSegment], source_lang: str, target_lang: str) -> List[DocumentSegment]:
        """Translate every segment that has no translation yet.

        Segments are updated in place and returned in their original order.
        """
        translated_segments = []
        for segment in segments:
            if not segment.translated:  # only translate untranslated segments
                result = await self.translate_text(segment.text, source_lang, target_lang)
                from_memory = bool(result.get('from_memory'))
                segment.translated = result['translated']
                segment.alternatives = result['alternatives']
                segment.confidence = result.get('confidence', 0.7)
                segment.review_status = 'from_memory' if from_memory else 'machine_translated'
                if not from_memory:
                    # Throttle API requests; memory hits made no request.
                    await asyncio.sleep(1)
            translated_segments.append(segment)
        return translated_segments
487
+
488
# Global translation manager instance shared by the request handlers.
translation_manager = TranslationManager()
490
+
491
@app.post("/upload")
async def upload_file(
    file: UploadFile = File(...),
    source_lang: str = Form("AUTO"),
    target_lang: str = Form("ZH")
):
    """Split an uploaded document into segments.

    Returns the segments plus the upload's file extension as
    ``source_file_type``. ``source_lang`` and ``target_lang`` are accepted
    for client compatibility but are not used here.

    Raises:
        HTTPException: status 400 when the upload cannot be parsed.
    """
    processor = DocumentProcessor()
    try:
        segments = await processor.extract_text(file)
    except HTTPException:
        raise
    except Exception as e:
        # A malformed or wrongly encoded upload is reported to the client
        # with the reason rather than as an opaque server error.
        raise HTTPException(status_code=400, detail=f"Could not parse uploaded file: {e}") from e

    # Record the original file type; the file name may be missing.
    file_type = (file.filename or '').split('.')[-1].lower()
    return {
        "segments": [seg.dict() for seg in segments],
        "source_file_type": file_type
    }
505
+
506
@app.post("/translate")
async def translate(
    segments: List[DocumentSegment],
    source_lang: str = Body("AUTO"),
    target_lang: str = Body("ZH")
):
    """Translate the posted segments and return them serialised as dicts."""
    finished = await translation_manager.translate_segments(
        segments, source_lang, target_lang
    )
    payload = []
    for item in finished:
        payload.append(item.dict())
    return {"segments": payload}
514
+
515
@app.post("/translate_text")
async def translate_text(request: TranslationRequest):
    """Translate a single piece of text and return the manager's result dict."""
    # NOTE(review): request.use_memory and request.use_terminology are not
    # forwarded — confirm whether they are meant to be honoured.
    text, src, dst = request.text, request.source_lang, request.target_lang
    return await translation_manager.translate_text(text, src, dst)
523
+
524
@app.post("/add_term")
async def add_term(
    source_term: str = Form(...),
    target_term: str = Form(...),
    source_lang: str = Form(...),
    target_lang: str = Form(...)
):
    """Store one glossary term for a language pair."""
    glossary = translation_manager.terminology
    glossary.add_term(
        source_term=source_term,
        target_term=target_term,
        source_lang=source_lang,
        target_lang=target_lang,
    )
    return {"status": "success"}
533
+
534
@app.get("/get_terminology")
async def get_terminology(source_lang: str, target_lang: str):
    """Return the glossary entries stored for a language pair."""
    glossary = translation_manager.terminology
    return {"terminology": glossary.get_terminology(source_lang, target_lang)}
538
+
539
@app.post("/export")
async def export_document(request: ExportRequest):
    """Render the posted segments and return them as a file download.

    Raises:
        HTTPException: status 400 when the requested format is not supported,
            status 500 for any other export failure.
    """
    try:
        exporter = DocumentExporter(
            [seg.dict() for seg in request.segments],
            request.source_file_type
        )
        result = exporter.export(request.format, request.mode)
    except ValueError as e:
        # The exporter raises ValueError for an unsupported format, which is
        # a problem with the request rather than a server fault.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return StreamingResponse(
        BytesIO(result['content']),
        media_type=result['mimetype'],
        headers={
            'Content-Disposition': f'attachment; filename=translated_document.{result["extension"]}'
        }
    )
557
+
558
@app.get("/")
async def read_root():
    """Serve the front-end page from the static directory."""
    index_page = 'static/index.html'
    return FileResponse(index_page)
561
+
562
def open_browser():
    """Open the locally served UI in the default web browser."""
    local_url = 'http://localhost:7860'
    webbrowser.open(local_url)
564
+
565
if __name__ == "__main__":
    # Schedule the browser to open 1.5 seconds from now.
    browser_timer = threading.Timer(1.5, open_browser)
    browser_timer.start()

    # Start the FastAPI service.
    # NOTE(review): host 0.0.0.0 listens on every network interface —
    # confirm that is intended outside a container.
    uvicorn.run(app, host="0.0.0.0", port=7860)