BIAF-offASR / backend /document_utils.py
froster02's picture
refactor: code cleanup and App.jsx component split
3496381
import os
import logging
from docx import Document
from pptx import Presentation
import fitz # PyMuPDF
import pandas as pd
from openpyxl import load_workbook
logger = logging.getLogger(__name__)
# Lazy import for EasyOCR to avoid slow startup if not used
_easyocr_reader = None
def extract_preview_text(path, ext):
"""Extract first 1000 characters to detect language"""
try:
if ext == ".docx":
doc = Document(path)
return " ".join([p.text for p in doc.paragraphs[:5]])[:1000]
elif ext == ".pptx":
prs = Presentation(path)
text = []
for slide in prs.slides[:3]:
for shape in slide.shapes:
if hasattr(shape, "text_frame") and shape.text_frame:
text.append(shape.text_frame.text)
return " ".join(text)[:1000]
elif ext == ".pdf":
doc = fitz.open(path)
text = ""
for i in range(min(3, len(doc))):
text += doc[i].get_text()
doc.close()
return text[:1000]
elif ext == ".xlsx":
df = pd.read_excel(path, nrows=5)
return df.to_string()[:1000]
except Exception as e:
logger.warning("Preview extraction failed: %s", e)
return ""
def get_ocr_reader():
global _easyocr_reader
if _easyocr_reader is None:
import easyocr
# Support Marathi, Hindi, and English OCR
_easyocr_reader = easyocr.Reader(['hi', 'mr', 'en'], gpu=False)
return _easyocr_reader
def ocr_and_translate_page(page, model_manager, src_lang, tgt_lang):
import numpy as np
try:
pix = page.get_pixmap()
img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
reader = get_ocr_reader()
results = reader.readtext(img_data) # returns [([coords], text, prob), ...]
for (bbox_coords, text, prob) in results:
if text.strip():
translated = model_manager.translate(text, src_lang, tgt_lang)
# Convert EasyOCR bbox to fitz Rect
# EasyOCR bbox: [[x,y],[x,y],[x,y],[x,y]]
x0 = min([p[0] for p in bbox_coords])
y0 = min([p[1] for p in bbox_coords])
x1 = max([p[0] for p in bbox_coords])
y1 = max([p[1] for p in bbox_coords])
# Map image coords to PDF page coords
img_w, img_h = pix.width, pix.height
page_w, page_h = page.rect.width, page.rect.height
rect = [
x0 * page_w / img_w,
y0 * page_h / img_h,
x1 * page_w / img_w,
y1 * page_h / img_h
]
# Since it's a scan, we don't redact (the image is the background)
# Just overlay the text
page.insert_textbox(rect, translated, fontsize=10, fontname="helv")
except Exception as e:
logger.warning("OCR overlay failed: %s", e)
def translate_docx(input_path, output_path, model_manager, src_lang, tgt_lang):
doc = Document(input_path)
# Process main paragraphs
for para in doc.paragraphs:
if para.text.strip():
translated = model_manager.translate(para.text, src_lang, tgt_lang)
if para.runs:
# Preservation Strategy: Put whole translation in first run, clear others
# This keeps the 'start' styling of the paragraph
para.runs[0].text = translated
for i in range(1, len(para.runs)):
para.runs[i].text = ""
# Process tables
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
for para in cell.paragraphs:
if para.text.strip():
translated = model_manager.translate(para.text, src_lang, tgt_lang)
if para.runs:
para.runs[0].text = translated
for i in range(1, len(para.runs)):
para.runs[i].text = ""
doc.save(output_path)
return output_path
def translate_pptx(input_path, output_path, model_manager, src_lang, tgt_lang):
prs = Presentation(input_path)
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text_frame") and shape.text_frame:
for paragraph in shape.text_frame.paragraphs:
if paragraph.text.strip():
translated = model_manager.translate(paragraph.text, src_lang, tgt_lang)
if paragraph.runs:
paragraph.runs[0].text = translated
for i in range(1, len(paragraph.runs)):
paragraph.runs[i].text = ""
if shape.has_table:
for row in shape.table.rows:
for cell in row.cells:
if cell.text_frame:
for paragraph in cell.text_frame.paragraphs:
if paragraph.text.strip():
translated = model_manager.translate(paragraph.text, src_lang, tgt_lang)
if paragraph.runs:
paragraph.runs[0].text = translated
for i in range(1, len(paragraph.runs)):
paragraph.runs[i].text = ""
prs.save(output_path)
return output_path
def translate_xlsx(input_path, output_path, model_manager, src_lang, tgt_lang):
from copy import copy
wb = load_workbook(input_path)
texts_to_translate = []
cell_refs = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
for row in ws.iter_rows():
for cell in row:
if cell.value and isinstance(cell.value, str) and cell.value.strip():
texts_to_translate.append(cell.value)
cell_refs.append((sheet_name, cell.coordinate))
if not texts_to_translate:
wb.save(output_path)
return output_path
translated_texts = model_manager.translate_batch(texts_to_translate, src_lang, tgt_lang)
for i, (sheet_name, coord) in enumerate(cell_refs):
ws = wb[sheet_name]
cell = ws[coord]
# Clone style
original_font = copy(cell.font)
original_fill = copy(cell.fill)
original_border = copy(cell.border)
original_alignment = copy(cell.alignment)
original_number_format = cell.number_format
original_protection = copy(cell.protection)
cell.value = translated_texts[i]
# Re-apply style
cell.font = original_font
cell.fill = original_fill
cell.border = original_border
cell.alignment = original_alignment
cell.number_format = original_number_format
cell.protection = original_protection
wb.save(output_path)
return output_path
def translate_pdf(input_path, output_path, model_manager, src_lang, tgt_lang):
import fitz
doc = fitz.open(input_path)
for page in doc:
# Get text blocks with coordinates
blocks = page.get_text("dict")["blocks"]
text_found = False
for b in blocks:
if "lines" in b:
text_found = True
for l in b["lines"]:
for s in l["spans"]:
original_text = s["text"]
if original_text.strip():
translated = model_manager.translate(original_text, src_lang, tgt_lang)
bbox = s["bbox"] # (x0, y0, x1, y1)
# Redact original text
page.add_redact_annot(bbox, fill=(1, 1, 1))
page.apply_redactions()
# Insert translated text
fontsize = s["size"]
# align=0 (Left), align=1 (Center), align=2 (Right)
page.insert_textbox(bbox, translated, fontsize=fontsize, fontname="helv", align=0)
# Fallback for scanned pages (Task 3 integration)
if not text_found:
ocr_and_translate_page(page, model_manager, src_lang, tgt_lang)
doc.save(output_path)
doc.close()
return output_path