Spaces:
Running
Running
| import os | |
| import logging | |
| from docx import Document | |
| from pptx import Presentation | |
| import fitz # PyMuPDF | |
| import pandas as pd | |
| from openpyxl import load_workbook | |
# Module-level logger used by the functions in this file to report
# recoverable failures as warnings.
logger = logging.getLogger(__name__)

# Lazy import for EasyOCR to avoid slow startup if not used.
# Holds the cached reader; stays None until get_ocr_reader() builds it.
_easyocr_reader = None
def extract_preview_text(path, ext):
    """Return up to 1000 characters of text from a document.

    The preview is meant for language detection, so only the beginning of
    the document is sampled (first 5 paragraphs, 3 slides, 3 pages or
    5 spreadsheet rows, depending on the format).

    Args:
        path: Path of the document to sample.
        ext: File extension including the dot. It is compared exactly
            against ".docx", ".pptx", ".pdf" and ".xlsx".

    Returns:
        The preview text, or "" when the extension is not recognised or
        the file cannot be read. Failures are logged, never raised.
    """
    from itertools import islice

    try:
        if ext == ".docx":
            doc = Document(path)
            return " ".join(p.text for p in doc.paragraphs[:5])[:1000]

        if ext == ".pptx":
            prs = Presentation(path)
            text = []
            # The python-pptx slide collection is iterable but does not
            # support slice indexing (`prs.slides[:3]` raises), so the
            # first three slides are taken with islice instead.
            for slide in islice(prs.slides, 3):
                for shape in slide.shapes:
                    if hasattr(shape, "text_frame") and shape.text_frame:
                        text.append(shape.text_frame.text)
            return " ".join(text)[:1000]

        if ext == ".pdf":
            # The context manager closes the file even if a page fails
            # to yield text.
            with fitz.open(path) as doc:
                page_count = min(3, len(doc))
                preview = "".join(doc[i].get_text() for i in range(page_count))
            return preview[:1000]

        if ext == ".xlsx":
            df = pd.read_excel(path, nrows=5)
            return df.to_string()[:1000]
    except Exception as e:
        # Best effort: a failed preview must not abort the caller.
        logger.warning("Preview extraction failed: %s", e)

    # Reached for unrecognised extensions and after a logged failure.
    return ""
def get_ocr_reader():
    """Return the shared EasyOCR reader, building it on first use.

    The `easyocr` import is deferred to the first call so that importing
    this module stays fast when OCR is never needed. The reader is created
    for Hindi, Marathi and English with the GPU disabled, and is cached in
    the module-level `_easyocr_reader` for later calls.
    """
    global _easyocr_reader

    if _easyocr_reader is not None:
        return _easyocr_reader

    import easyocr

    # Support Marathi, Hindi, and English OCR
    ocr_languages = ['hi', 'mr', 'en']
    _easyocr_reader = easyocr.Reader(ocr_languages, gpu=False)
    return _easyocr_reader
def ocr_and_translate_page(page, model_manager, src_lang, tgt_lang):
    """OCR a scanned PDF page and overlay the translated text on it.

    The page is rendered to a pixmap, EasyOCR detects text regions, each
    region is translated and written back as a text box at the matching
    position. The scanned image itself is left untouched (no redaction).

    Args:
        page: PyMuPDF page to read from and write onto.
        model_manager: Object exposing `translate(text, src_lang, tgt_lang)`.
        src_lang: Source language code passed through to the translator.
        tgt_lang: Target language code passed through to the translator.

    Returns:
        None. Any failure is logged as a warning and swallowed so that one
        bad page does not abort the whole document.
    """
    import numpy as np

    start_fontsize = 10
    min_fontsize = 4

    try:
        pix = page.get_pixmap()
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

        # Factors that map pixmap pixel coordinates to page coordinates;
        # they are identical for every OCR hit, so compute them once.
        scale_x = page.rect.width / pix.width
        scale_y = page.rect.height / pix.height

        reader = get_ocr_reader()
        results = reader.readtext(img_data)  # [([coords], text, prob), ...]

        for bbox_coords, text, _prob in results:
            if not text.strip():
                continue

            translated = model_manager.translate(text, src_lang, tgt_lang)

            # EasyOCR bbox is four [x, y] corners; reduce it to the
            # axis-aligned rectangle (x0, y0, x1, y1) in page coordinates.
            xs = [float(point[0]) for point in bbox_coords]
            ys = [float(point[1]) for point in bbox_coords]
            rect = [
                min(xs) * scale_x,
                min(ys) * scale_y,
                max(xs) * scale_x,
                max(ys) * scale_y,
            ]

            # insert_textbox returns a negative number and writes nothing
            # when the text does not fit, so shrink the font until it does.
            # NOTE(review): "helv" is a Latin base font; confirm it can
            # render the target script (e.g. Devanagari) before relying on it.
            fontsize = start_fontsize
            while page.insert_textbox(rect, translated, fontsize=fontsize, fontname="helv") < 0:
                fontsize -= 1
                if fontsize < min_fontsize:
                    logger.warning("OCR overlay text did not fit into %s", rect)
                    break
    except Exception as e:
        logger.warning("OCR overlay failed: %s", e)
def _translate_docx_paragraph(para, model_manager, src_lang, tgt_lang):
    """Translate one paragraph in place, keeping the first run's formatting."""
    # Nothing to translate, or no run available to receive the translation.
    if not para.text.strip() or not para.runs:
        return

    translated = model_manager.translate(para.text, src_lang, tgt_lang)

    # Preservation strategy: put the whole translation in the first run and
    # blank the others, so the paragraph keeps its starting style.
    first_run, *other_runs = para.runs
    first_run.text = translated
    for run in other_runs:
        run.text = ""


def translate_docx(input_path, output_path, model_manager, src_lang, tgt_lang):
    """Translate the body paragraphs and table cells of a .docx file.

    Args:
        input_path: Path of the document to read.
        output_path: Path the translated document is saved to.
        model_manager: Object exposing `translate(text, src_lang, tgt_lang)`.
        src_lang: Source language code passed through to the translator.
        tgt_lang: Target language code passed through to the translator.

    Returns:
        `output_path`, after the translated document has been saved.
    """
    doc = Document(input_path)

    # Process main paragraphs
    for para in doc.paragraphs:
        _translate_docx_paragraph(para, model_manager, src_lang, tgt_lang)

    # Process tables. python-docx reports a merged cell once for every grid
    # position it spans, so remember the underlying XML cell elements to
    # avoid translating (and re-translating) the same text more than once.
    seen_cells = set()
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                cell_element = cell._tc
                if cell_element in seen_cells:
                    continue
                seen_cells.add(cell_element)
                for para in cell.paragraphs:
                    _translate_docx_paragraph(para, model_manager, src_lang, tgt_lang)

    doc.save(output_path)
    return output_path
def _translate_pptx_text_frame(text_frame, model_manager, src_lang, tgt_lang):
    """Translate every paragraph of a text frame in place."""
    for paragraph in text_frame.paragraphs:
        # Skip blank paragraphs and ones with no run to hold the result.
        if not paragraph.text.strip() or not paragraph.runs:
            continue

        translated = model_manager.translate(paragraph.text, src_lang, tgt_lang)

        # Whole translation goes into the first run (keeps its formatting);
        # the remaining runs are blanked.
        first_run, *other_runs = paragraph.runs
        first_run.text = translated
        for run in other_runs:
            run.text = ""


def _translate_pptx_shapes(shapes, model_manager, src_lang, tgt_lang):
    """Translate text frames and tables in `shapes`, descending into groups."""
    for shape in shapes:
        # Group shapes expose their members through `.shapes`; without this
        # recursion any text inside a group would stay untranslated.
        if hasattr(shape, "shapes"):
            _translate_pptx_shapes(shape.shapes, model_manager, src_lang, tgt_lang)
            continue

        # has_text_frame is checked instead of touching `shape.text_frame`,
        # which can create an empty text body on shapes that lack one.
        if shape.has_text_frame:
            _translate_pptx_text_frame(shape.text_frame, model_manager, src_lang, tgt_lang)

        # Tables live in graphic frames, which have no text frame of their
        # own, so this check must not depend on the one above.
        if shape.has_table:
            for row in shape.table.rows:
                for cell in row.cells:
                    if cell.text_frame:
                        _translate_pptx_text_frame(
                            cell.text_frame, model_manager, src_lang, tgt_lang
                        )


def translate_pptx(input_path, output_path, model_manager, src_lang, tgt_lang):
    """Translate the text of every slide in a .pptx presentation.

    Args:
        input_path: Path of the presentation to read.
        output_path: Path the translated presentation is saved to.
        model_manager: Object exposing `translate(text, src_lang, tgt_lang)`.
        src_lang: Source language code passed through to the translator.
        tgt_lang: Target language code passed through to the translator.

    Returns:
        `output_path`, after the translated presentation has been saved.
    """
    prs = Presentation(input_path)
    for slide in prs.slides:
        _translate_pptx_shapes(slide.shapes, model_manager, src_lang, tgt_lang)
    prs.save(output_path)
    return output_path
def translate_xlsx(input_path, output_path, model_manager, src_lang, tgt_lang):
    """Translate the text cells of every worksheet in an .xlsx workbook.

    All translatable cell values are collected first and sent to the
    translator in a single batch, then written back to the same cells.

    Args:
        input_path: Path of the workbook to read.
        output_path: Path the translated workbook is saved to.
        model_manager: Object exposing
            `translate_batch(texts, src_lang, tgt_lang)`, which must return
            one translation per input text, in order.
        src_lang: Source language code passed through to the translator.
        tgt_lang: Target language code passed through to the translator.

    Returns:
        `output_path`, after the workbook has been saved.

    Raises:
        ValueError: If the translator returns a different number of texts
            than it was given.
    """
    wb = load_workbook(input_path)

    # `wb.worksheets` is used rather than `wb.sheetnames` because the latter
    # also lists chartsheets, which have no cells to iterate.
    target_cells = []
    for ws in wb.worksheets:
        for row in ws.iter_rows():
            for cell in row:
                value = cell.value
                if not isinstance(value, str) or not value.strip():
                    continue
                # Formulas are loaded as strings such as "=SUM(A1:A3)";
                # translating them would corrupt the spreadsheet logic.
                if cell.data_type == "f":
                    continue
                target_cells.append(cell)

    if not target_cells:
        wb.save(output_path)
        return output_path

    texts_to_translate = [cell.value for cell in target_cells]
    translated_texts = model_manager.translate_batch(texts_to_translate, src_lang, tgt_lang)

    # Check before mutating anything so a bad batch cannot leave the
    # workbook half translated.
    if len(translated_texts) != len(target_cells):
        raise ValueError(
            "translate_batch returned %d texts for %d cells"
            % (len(translated_texts), len(target_cells))
        )

    # Assigning a string value leaves the cell's font, fill, border,
    # alignment, number format and protection untouched, so no style
    # copy/restore is needed.
    for cell, translated in zip(target_cells, translated_texts):
        cell.value = translated

    wb.save(output_path)
    return output_path
def _insert_fitted_textbox(page, rect, text, fontsize, min_fontsize=4):
    """Insert `text` into `rect`, shrinking the font until it fits.

    Returns True when the text was written, False when it did not fit even
    at `min_fontsize` (in which case nothing is written).
    """
    size = fontsize
    while True:
        # insert_textbox returns a negative number and writes nothing when
        # the text overflows the rectangle.
        # align=0 (Left), align=1 (Center), align=2 (Right)
        if page.insert_textbox(rect, text, fontsize=size, fontname="helv", align=0) >= 0:
            return True
        if size <= min_fontsize:
            return False
        size = max(size - 1, min_fontsize)


def translate_pdf(input_path, output_path, model_manager, src_lang, tgt_lang):
    """Translate a PDF by replacing each text span with its translation.

    For every page, all non-blank text spans are translated, the originals
    are redacted with a white fill in a single pass, and the translations
    are written into the same bounding boxes. Pages without any non-blank
    text span are treated as scans and handed to `ocr_and_translate_page`.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the translated PDF is saved to.
        model_manager: Object exposing `translate(text, src_lang, tgt_lang)`.
        src_lang: Source language code passed through to the translator.
        tgt_lang: Target language code passed through to the translator.

    Returns:
        `output_path`, after the translated PDF has been saved.
    """
    import fitz

    doc = fitz.open(input_path)
    try:
        for page in doc:
            # Collect (bbox, translated text, font size) for every span
            # before touching the page; extraction must see the original.
            replacements = []
            for block in page.get_text("dict")["blocks"]:
                # Image blocks carry no "lines" key.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        original_text = span["text"]
                        if not original_text.strip():
                            continue
                        translated = model_manager.translate(original_text, src_lang, tgt_lang)
                        replacements.append((span["bbox"], translated, span["size"]))

            # Fallback for scanned pages (Task 3 integration)
            if not replacements:
                ocr_and_translate_page(page, model_manager, src_lang, tgt_lang)
                continue

            # Redact every original span first and apply once. Applying per
            # span is slow and can wipe translated text that was already
            # inserted into an overlapping box.
            for bbox, _translated, _size in replacements:
                page.add_redact_annot(bbox, fill=(1, 1, 1))
            page.apply_redactions()

            # NOTE(review): "helv" is a Latin base font; confirm it can
            # render the target script before relying on it.
            for bbox, translated, size in replacements:
                if not _insert_fitted_textbox(page, bbox, translated, size):
                    logger.warning(
                        "Translated text did not fit into %s on page %d",
                        bbox,
                        page.number + 1,
                    )

        doc.save(output_path)
    finally:
        # Release the file handle even when translation or saving fails.
        doc.close()
    return output_path