Spaces:
Running
Running
File size: 8,882 Bytes
88d3f92 3496381 88d3f92 3496381 88d3f92 0acd147 3496381 0acd147 88d3f92 3496381 88d3f92 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 | import os
import logging
from docx import Document
from pptx import Presentation
import fitz # PyMuPDF
import pandas as pd
from openpyxl import load_workbook
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Lazy import for EasyOCR to avoid slow startup if not used.
# Cache slot for the shared reader: stays None until get_ocr_reader() fills it.
_easyocr_reader = None
def extract_preview_text(path, ext):
    """Return up to the first 1000 characters of a document for language detection.

    Args:
        path: Location of the document on disk.
        ext: File extension including the leading dot; ".docx", ".pptx",
            ".pdf" and ".xlsx" are handled.

    Returns:
        A text sample of at most 1000 characters. An empty string is returned
        when the extension is not handled or when extraction fails; failures
        are logged as warnings instead of being raised.
    """
    from itertools import islice

    max_chars = 1000
    try:
        if ext == ".docx":
            doc = Document(path)
            return " ".join(p.text for p in doc.paragraphs[:5])[:max_chars]
        if ext == ".pptx":
            prs = Presentation(path)
            texts = []
            # The python-pptx slide collection can be indexed and iterated but
            # not sliced, so the first three slides are taken with islice.
            for slide in islice(prs.slides, 3):
                for shape in slide.shapes:
                    if hasattr(shape, "text_frame") and shape.text_frame:
                        texts.append(shape.text_frame.text)
            return " ".join(texts)[:max_chars]
        if ext == ".pdf":
            doc = fitz.open(path)
            try:
                page_count = min(3, len(doc))
                pages = [doc[i].get_text() for i in range(page_count)]
            finally:
                # Release the file handle even if text extraction raises.
                doc.close()
            return "".join(pages)[:max_chars]
        if ext == ".xlsx":
            df = pd.read_excel(path, nrows=5)
            return df.to_string()[:max_chars]
    except Exception as e:
        # Preview is best-effort: report the problem and fall back to "".
        logger.warning("Preview extraction failed: %s", e)
    # Reached on failure and for extensions that are not handled above.
    return ""
def get_ocr_reader():
    """Return the shared EasyOCR reader, creating it on first use.

    The easyocr package is imported inside the function so that it is only
    loaded when OCR is actually requested. The reader is stored in the
    module-level ``_easyocr_reader`` and reused by later calls.
    """
    global _easyocr_reader
    if _easyocr_reader is not None:
        return _easyocr_reader

    import easyocr

    # Hindi, Marathi and English recognition, running on the CPU.
    ocr_languages = ['hi', 'mr', 'en']
    _easyocr_reader = easyocr.Reader(ocr_languages, gpu=False)
    return _easyocr_reader
def ocr_and_translate_page(page, model_manager, src_lang, tgt_lang):
    """OCR a scanned PDF page and overlay the translated text on top of it.

    The page is rendered to a pixmap, passed to the shared EasyOCR reader, and
    every non-blank recognised string is translated and written into the
    matching rectangle of the page. The scanned image itself is left in place
    (nothing is redacted).

    Args:
        page: PyMuPDF page to read from and write to (modified in place).
        model_manager: Object exposing ``translate(text, src_lang, tgt_lang)``.
        src_lang: Source language passed through to ``translate``.
        tgt_lang: Target language passed through to ``translate``.

    Returns:
        None. Any error is logged as a warning and swallowed, so a failing
        page does not abort the surrounding document translation.
    """
    import numpy as np

    try:
        pix = page.get_pixmap()
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        reader = get_ocr_reader()
        results = reader.readtext(img_data)  # returns [([coords], text, prob), ...]

        # Image and page sizes are the same for every box: read them once.
        img_w, img_h = pix.width, pix.height
        page_w, page_h = page.rect.width, page.rect.height

        for bbox_coords, text, _prob in results:
            if not text.strip():
                continue
            translated = model_manager.translate(text, src_lang, tgt_lang)

            # EasyOCR bbox: [[x,y],[x,y],[x,y],[x,y]] -> axis-aligned extent.
            xs = [p[0] for p in bbox_coords]
            ys = [p[1] for p in bbox_coords]
            # Map image coords to PDF page coords.
            rect = [
                min(xs) * page_w / img_w,
                min(ys) * page_h / img_h,
                max(xs) * page_w / img_w,
                max(ys) * page_h / img_h,
            ]

            # insert_textbox writes nothing and returns a negative number when
            # the text does not fit, so retry with smaller sizes instead of
            # silently losing the translation.
            # NOTE(review): "helv" is a Latin base font; confirm rendering
            # when the target language uses another script.
            for fontsize in (10, 8, 6, 4):
                leftover = page.insert_textbox(
                    rect, translated, fontsize=fontsize, fontname="helv"
                )
                if leftover >= 0:
                    break
            else:
                logger.debug("OCR overlay text did not fit its box: %r", translated)
    except Exception as e:
        logger.warning("OCR overlay failed: %s", e)
def translate_docx(input_path, output_path, model_manager, src_lang, tgt_lang):
    """Translate the body paragraphs and tables of a .docx file.

    Args:
        input_path: Path of the document to read.
        output_path: Path the translated document is saved to.
        model_manager: Object exposing ``translate(text, src_lang, tgt_lang)``.
        src_lang: Source language passed through to ``translate``.
        tgt_lang: Target language passed through to ``translate``.

    Returns:
        ``output_path``.
    """
    doc = Document(input_path)
    for para in doc.paragraphs:
        _translate_docx_paragraph(para, model_manager, src_lang, tgt_lang)
    _translate_docx_tables(doc.tables, model_manager, src_lang, tgt_lang)
    doc.save(output_path)
    return output_path


def _translate_docx_paragraph(para, model_manager, src_lang, tgt_lang):
    """Translate one paragraph in place, keeping the first run's styling."""
    runs = para.runs
    # Without runs there is nowhere to write the result, so skip the
    # (potentially expensive) translation call as well.
    if not runs or not para.text.strip():
        return
    translated = model_manager.translate(para.text, src_lang, tgt_lang)
    # Preservation strategy: the whole translation goes into the first run
    # and the remaining runs are emptied, so the paragraph keeps the
    # formatting it started with.
    runs[0].text = translated
    for run in runs[1:]:
        run.text = ""


def _translate_docx_tables(tables, model_manager, src_lang, tgt_lang):
    """Translate every distinct cell of ``tables``, including nested tables."""
    for table in tables:
        seen_cells = set()
        for row in table.rows:
            for cell in row.cells:
                # A merged cell is reported once per grid position it spans;
                # all of those share one underlying XML element. Translating
                # it again would feed already-translated text to the model.
                tc = cell._tc
                if tc in seen_cells:
                    continue
                seen_cells.add(tc)
                for para in cell.paragraphs:
                    _translate_docx_paragraph(para, model_manager, src_lang, tgt_lang)
                # Tables nested inside a cell are not part of doc.tables.
                _translate_docx_tables(cell.tables, model_manager, src_lang, tgt_lang)
def translate_pptx(input_path, output_path, model_manager, src_lang, tgt_lang):
    """Translate the text of every slide in a .pptx file.

    Text frames, table cells and shapes nested inside group shapes are
    translated in place.

    Args:
        input_path: Path of the presentation to read.
        output_path: Path the translated presentation is saved to.
        model_manager: Object exposing ``translate(text, src_lang, tgt_lang)``.
        src_lang: Source language passed through to ``translate``.
        tgt_lang: Target language passed through to ``translate``.

    Returns:
        ``output_path``.
    """
    prs = Presentation(input_path)
    for slide in prs.slides:
        _translate_pptx_shapes(slide.shapes, model_manager, src_lang, tgt_lang)
    prs.save(output_path)
    return output_path


def _translate_pptx_text_frame(text_frame, model_manager, src_lang, tgt_lang):
    """Translate each paragraph of a text frame, keeping first-run styling."""
    for paragraph in text_frame.paragraphs:
        runs = paragraph.runs
        # No runs means nowhere to store the result: skip the model call.
        if not runs or not paragraph.text.strip():
            continue
        translated = model_manager.translate(paragraph.text, src_lang, tgt_lang)
        # Whole translation goes into the first run; the rest are emptied.
        runs[0].text = translated
        for run in runs[1:]:
            run.text = ""


def _translate_pptx_shapes(shapes, model_manager, src_lang, tgt_lang):
    """Translate text frames and tables in ``shapes``, descending into groups."""
    for shape in shapes:
        if hasattr(shape, "text_frame") and shape.text_frame:
            _translate_pptx_text_frame(
                shape.text_frame, model_manager, src_lang, tgt_lang
            )
        if shape.has_table:
            for row in shape.table.rows:
                for cell in row.cells:
                    if cell.text_frame:
                        _translate_pptx_text_frame(
                            cell.text_frame, model_manager, src_lang, tgt_lang
                        )
        # Group shapes expose their members through ``shapes``; without this
        # step grouped text would be left untranslated.
        child_shapes = getattr(shape, "shapes", None)
        if child_shapes is not None:
            _translate_pptx_shapes(child_shapes, model_manager, src_lang, tgt_lang)
def translate_xlsx(input_path, output_path, model_manager, src_lang, tgt_lang):
    """Translate the text cells of an .xlsx workbook in one batch.

    All non-blank string cells are collected, sent to
    ``model_manager.translate_batch`` in a single call, and written back with
    their style attributes re-applied. Formula cells and chartsheets are left
    untouched.

    Args:
        input_path: Path of the workbook to read.
        output_path: Path the translated workbook is saved to.
        model_manager: Object exposing
            ``translate_batch(texts, src_lang, tgt_lang)`` returning one
            translation per input text, in order.
        src_lang: Source language passed through to ``translate_batch``.
        tgt_lang: Target language passed through to ``translate_batch``.

    Returns:
        ``output_path``.
    """
    from copy import copy

    style_attrs = (
        "font",
        "fill",
        "border",
        "alignment",
        "number_format",
        "protection",
    )

    wb = load_workbook(input_path)

    cells = []
    texts_to_translate = []
    # ``worksheets`` leaves out chartsheets, which have no cells to iterate.
    for ws in wb.worksheets:
        for row in ws.iter_rows():
            for cell in row:
                value = cell.value
                if not (isinstance(value, str) and value.strip()):
                    continue
                # Formulas are stored as strings ("=SUM(A1:A3)"); translating
                # them would break the calculation.
                if cell.data_type == "f":
                    continue
                cells.append(cell)
                texts_to_translate.append(value)

    if texts_to_translate:
        translated_texts = model_manager.translate_batch(
            texts_to_translate, src_lang, tgt_lang
        )
        for idx, cell in enumerate(cells):
            # Snapshot the style, replace the value, then re-apply the style.
            saved_style = {name: copy(getattr(cell, name)) for name in style_attrs}
            cell.value = translated_texts[idx]
            for name, style_value in saved_style.items():
                setattr(cell, name, style_value)

    wb.save(output_path)
    return output_path
def translate_pdf(input_path, output_path, model_manager, src_lang, tgt_lang):
    """Translate a PDF by replacing each text span with its translation.

    For every page the text spans are translated, the originals are covered
    with white redactions, and the translations are written into the same
    boxes. Pages without any text block fall back to
    ``ocr_and_translate_page``.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the translated PDF is saved to.
        model_manager: Object exposing ``translate(text, src_lang, tgt_lang)``.
        src_lang: Source language passed through to ``translate``.
        tgt_lang: Target language passed through to ``translate``.

    Returns:
        ``output_path``.
    """
    import fitz

    doc = fitz.open(input_path)
    try:
        for page in doc:
            # Get text blocks with coordinates.
            blocks = page.get_text("dict")["blocks"]
            text_found = False
            replacements = []  # (bbox, translated text, font size)
            for block in blocks:
                if "lines" not in block:
                    continue
                text_found = True
                for line in block["lines"]:
                    for span in line["spans"]:
                        original_text = span["text"]
                        if not original_text.strip():
                            continue
                        translated = model_manager.translate(
                            original_text, src_lang, tgt_lang
                        )
                        # span["bbox"] is (x0, y0, x1, y1)
                        replacements.append((span["bbox"], translated, span["size"]))

            # Fallback for scanned pages (no text layer at all).
            if not text_found:
                ocr_and_translate_page(page, model_manager, src_lang, tgt_lang)
                continue

            if replacements:
                # Redact everything first and apply once: applying after each
                # span re-processes the page every time and can erase
                # translations already written into overlapping boxes.
                for bbox, _translated, _size in replacements:
                    page.add_redact_annot(bbox, fill=(1, 1, 1))
                page.apply_redactions()
                for bbox, translated, size in replacements:
                    if not _insert_textbox_shrink_to_fit(page, bbox, translated, size):
                        logger.warning(
                            "Translated text did not fit its box and was skipped: %r",
                            translated,
                        )
        doc.save(output_path)
    finally:
        # Close the document even when translation or saving fails.
        doc.close()
    return output_path


def _insert_textbox_shrink_to_fit(page, bbox, text, fontsize, min_fontsize=4):
    """Write ``text`` into ``bbox``, reducing the font size until it fits.

    Returns True when the text was written, False when it did not fit even at
    the smallest size tried.
    """
    size = fontsize
    floor = min(fontsize, min_fontsize)
    while True:
        # insert_textbox writes nothing and returns a negative number when
        # the text does not fit the rectangle. align=0 is left alignment.
        # NOTE(review): "helv" is a Latin base font; confirm rendering when
        # the target language uses another script.
        leftover = page.insert_textbox(
            bbox, text, fontsize=size, fontname="helv", align=0
        )
        if leftover >= 0:
            return True
        if size <= floor:
            return False
        size = max(floor, size * 0.9)
|