# OmniFile-Processor / src/export.py
# Dr. Abdulmalek
# deploy: OmniFile AI Processor v4.3.0
# 900df0b
"""
HandwrittenOCR - تصدير بيانات التدريب ورفع إلى HuggingFace v4.0
====================================================================
المحسنات:
- auto_export(): CSV + XLSX + نص كامل + JSONL تدريب
- create_backup(): نسخ احتياطي شامل
- push_to_huggingface(): مع commit_message يحتوي التاريخ
"""
import os
import json
import random
import shutil
import tempfile
import logging
from datetime import datetime
from pathlib import Path
from src.logger import log_step
import pandas as pd
logger = logging.getLogger("HandwrittenOCR")
def auto_export(
    db,
    run_id: str,
    output_dir: str = None,
    config=None,
) -> dict:
    """
    Export every word in the database as CSV, XLSX, reconstructed text and training JSONL.

    Parameters:
        db: database object; only ``db.get_all()`` is called and its result is
            loaded into a ``pandas.DataFrame``.
        run_id: run identifier, used for the default output folder and the summary.
        output_dir: destination folder (optional). Defaults to
            ``<config.exports_dir>/auto/<run_id>`` when ``config`` is given,
            otherwise ``exports/auto/<run_id>``.
        config: settings object (optional); only ``exports_dir`` is read.

    Returns:
        Summary dict with the keys ``run_id``, ``exported_at``, ``total_words``,
        ``verified``, ``dir`` and ``files`` (the same content is written to
        ``export_summary.json``), or an empty dict when the database has no rows.
    """
    if output_dir is None:
        base_dir = config.exports_dir if config else "exports"
        output_dir = os.path.join(base_dir, "auto", run_id)
    os.makedirs(output_dir, exist_ok=True)

    # Fetch the data
    words = db.get_all()
    if not words:
        logger.warning("لا توجد بيانات للتصدير")
        return {}

    df_all = pd.DataFrame(words)
    if "status" in df_all.columns:
        df_verified = df_all[
            df_all["status"].isin(["verified", "sentence_corrected"])
        ]
    else:
        # No status information at all: nothing can count as verified.
        df_verified = df_all.iloc[0:0]
    # Binary image blobs do not belong in the tabular exports.
    df_csv = df_all.drop(columns=["image_data"], errors="ignore")
    exported = {}

    # --- CSV ---
    csv_path = os.path.join(output_dir, "all_words.csv")
    df_csv.to_csv(csv_path, index=False, encoding="utf-8-sig")
    exported["csv"] = csv_path

    # --- XLSX (one extra sheet per page) ---
    try:
        xlsx_path = os.path.join(output_dir, "all_words.xlsx")
        with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer:
            df_csv.to_excel(writer, sheet_name="All", index=False)
            # Per-page sheets are only possible when page numbers were recorded.
            if "page_num" in df_csv.columns:
                for pg in sorted(df_csv["page_num"].dropna().unique()):
                    page_df = df_csv[df_csv["page_num"] == pg]
                    page_df.to_excel(
                        writer, sheet_name=f"P{int(pg)}", index=False
                    )
        exported["xlsx"] = xlsx_path
    except ImportError:
        logger.warning("openpyxl غير مثبت - تخطي XLSX")

    # --- Reconstructed full text ---
    try:
        from src.reconstruction import reconstruct_sentences_direct

        text_lines = reconstruct_sentences_direct(df_all)
        text_path = os.path.join(output_dir, "reconstructed_text.txt")
        with open(text_path, "w", encoding="utf-8") as f:
            f.write("\n".join(text_lines))
        exported["text"] = text_path
    except Exception as e:
        # Best effort: a reconstruction failure must not abort the other exports.
        logger.warning(f"فشل إعادة بناء النص: {e}")

    # --- Training JSONL ---
    if not df_verified.empty:
        img_dir = os.path.join(output_dir, "training_images")
        os.makedirs(img_dir, exist_ok=True)
        records = []
        skipped = 0
        for _, row in df_verified.iterrows():
            image = row.get("image_data")
            # A row without a usable image blob cannot become a training
            # sample; skip it instead of crashing on f.write(None).
            if not isinstance(image, (bytes, bytearray, memoryview)) or not image:
                skipped += 1
                continue
            fname = f"img_{row['image_id']}.png"
            with open(os.path.join(img_dir, fname), "wb") as f:
                f.write(image)
            # Missing text may arrive as None or as a float NaN from pandas.
            raw_text = row.get("predicted_text")
            txt = raw_text.strip() if isinstance(raw_text, str) else ""
            if txt:
                records.append({"image": fname, "text": txt})
        if skipped:
            logger.warning(f"تم تخطي {skipped} عينة بدون بيانات صورة")

        jsonl_path = os.path.join(output_dir, "training_data.jsonl")
        with open(jsonl_path, "w", encoding="utf-8") as f:
            for rec in records:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        exported["jsonl"] = jsonl_path
        exported["training_samples"] = len(records)

    summary = {
        "run_id": run_id,
        "exported_at": datetime.now().isoformat(),
        "total_words": len(df_all),
        "verified": len(df_verified),
        "dir": output_dir,
        "files": exported,
    }
    summary_path = os.path.join(output_dir, "export_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)
    logger.info(f"تم التصدير التلقائي: {output_dir}")
    return summary
def export_finetuning_dataset(
db,
output_dir: str,
val_ratio: float = 0.1,
) -> str | None:
"""
تصدير البيانات الموثقة كبيانات تدريب JSONL مع train/val split.
Parameters:
db: كائن قاعدة البيانات
output_dir: مجلد الإخراج
val_ratio: نسبة بيانات التحقق
Returns:
مسار مجلد الإخراج أو None
"""
verified = db.get_verified()
verified = [
w for w in verified
if w.get("status") in ("verified", "sentence_corrected")
]
if not verified:
logger.warning("لا توجد بيانات موثقة للتصدير")
return None
os.makedirs(output_dir, exist_ok=True)
img_dir = os.path.join(output_dir, "images")
os.makedirs(img_dir, exist_ok=True)
jsonl_records = []
for row in verified:
filename = f"img_{row['image_id']}.png"
filepath = os.path.join(img_dir, filename)
with open(filepath, "wb") as f:
f.write(row["image_data"])
text = (row["predicted_text"] or "").strip()
if text:
jsonl_records.append({"image": filename, "text": text})
if not jsonl_records:
return None
random.shuffle(jsonl_records)
split_idx = int(len(jsonl_records) * (1 - val_ratio))
train_data = jsonl_records[:split_idx]
val_data = jsonl_records[split_idx:]
def save_jsonl(data, fname):
path = os.path.join(output_dir, fname)
with open(path, "w", encoding="utf-8") as f:
for rec in data:
f.write(json.dumps(rec, ensure_ascii=False) + "\n")
return path
train_path = save_jsonl(train_data, "train.jsonl")
val_path = save_jsonl(val_data, "val.jsonl")
logger.info(
f"تم التصدير: {len(jsonl_records)} عينة "
f"(train={len(train_data)}, val={len(val_data)})"
)
return output_dir
def create_backup(config) -> str:
    """
    Create a timestamped backup folder and copy the project's data files into it.

    Files are copied with ``shutil`` rather than a shell ``!cp`` command (fix #1).

    Parameters:
        config: settings object; reads ``backups_dir``, ``db_path``,
            ``feedback_csv``, ``stats_json``, ``correction_dict_path``,
            ``events_jsonl`` and ``artifacts_dir``.

    Returns:
        Path of the backup folder, ``<backups_dir>/backup_<YYYYmmdd_HHMMSS>``.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_dir = os.path.join(config.backups_dir, f"backup_{stamp}")
    os.makedirs(backup_dir, exist_ok=True)

    # Resolve every source path up front, then copy the ones that exist.
    source_paths = [
        getattr(config, attr)
        for attr in (
            "db_path",
            "feedback_csv",
            "stats_json",
            "correction_dict_path",
            "events_jsonl",
        )
    ]
    for source in source_paths:
        if not os.path.exists(source):
            continue
        target = os.path.join(bk_dir_name(backup_dir), os.path.basename(source)) if False else os.path.join(backup_dir, os.path.basename(source))
        shutil.copy2(source, target)

    # Copy the artifacts folder if it exists
    artifacts_src = config.artifacts_dir
    if os.path.isdir(artifacts_src):
        artifacts_dst = os.path.join(backup_dir, "artifacts")
        if not os.path.exists(artifacts_dst):
            shutil.copytree(artifacts_src, artifacts_dst)

    logger.info(f"تم إنشاء نسخة احتياطية: {backup_dir}")
    return backup_dir
def push_to_huggingface(
    local_dataset_dir: str,
    hf_repo_id: str,
    hf_token: str = "",
    commit_message: str = "",
) -> bool:
    """
    Upload a local dataset folder to a HuggingFace Hub dataset repository.

    Parameters:
        local_dataset_dir: folder whose contents are uploaded.
        hf_repo_id: target dataset repository id.
        hf_token: access token; when non-empty, ``huggingface_hub.login`` is
            called with it before uploading.
        commit_message: commit message; when empty, a default message that
            includes the current date and time is generated.

    Returns:
        ``True`` when the upload succeeded. ``False`` when ``huggingface_hub``
        is not installed, the folder does not exist, login fails or the
        upload fails (the reason is logged).
    """
    try:
        from huggingface_hub import HfApi, login
    except ImportError:
        logger.error("huggingface_hub غير مثبت")
        return False

    # upload_folder needs a directory, so a regular file is rejected here too.
    if not os.path.isdir(local_dataset_dir):
        logger.error(f"المجلد غير موجود: {local_dataset_dir}")
        return False

    if hf_token:
        try:
            login(token=hf_token)
        except Exception as e:
            logger.error(f"فشل تسجيل الدخول: {e}")
            return False

    api = HfApi()
    try:
        api.create_repo(
            repo_id=hf_repo_id, repo_type="dataset", exist_ok=True
        )
    except Exception as e:
        # Still best effort (the repo may exist without create rights), but
        # record the cause so a later upload failure can be diagnosed.
        logger.warning(f"تعذر إنشاء المستودع أو التحقق منه: {e}")

    # Default commit message carries the date
    if not commit_message:
        commit_message = f"Update dataset - {datetime.now().strftime('%Y-%m-%d %H:%M')}"

    try:
        api.upload_folder(
            folder_path=local_dataset_dir,
            repo_id=hf_repo_id,
            repo_type="dataset",
            commit_message=commit_message,
        )
    except Exception as e:
        logger.error(f"فشل الرفع: {e}")
        return False

    url = f"https://huggingface.co/datasets/{hf_repo_id}"
    logger.info(f"تم رفع البيانات إلى {url}")
    return True
def export_pdf_report(db, output_path: str, title: str = "Handwriting OCR Report") -> str:
    """
    Export a PDF report containing each word's image and its text.

    Requires: pip install fpdf2

    Parameters:
        db: database object; only ``db.get_all()`` is called and it must
            return a list of row dicts.
        output_path: destination PDF path; its parent folder is created when
            the path has one.
        title: heading printed at the top of every page.

    Returns:
        ``output_path`` on success, or ``""`` when fpdf is not installed or
        the database has no rows.

    NOTE(review): the report uses the built-in Helvetica font, which only
    covers Latin-1. Characters outside that range are replaced with ``?`` so
    the export does not crash; rendering Arabic text properly would need a
    Unicode TTF font registered with ``add_font``.
    """
    try:
        from fpdf import FPDF
    except ImportError:
        logger.warning("fpdf2 غير مثبت — تثبيت: pip install fpdf2")
        return ""

    log_step(logger, "export_pdf_report", {"output_path": output_path})
    words = db.get_all()
    if not words:
        logger.info("لا توجد بيانات لتصدير PDF")
        return ""

    def latin1_safe(value) -> str:
        """Return str(value) with characters the core font cannot encode replaced."""
        return str(value).encode("latin-1", "replace").decode("latin-1")

    safe_title = latin1_safe(title)

    class OCRReport(FPDF):
        def header(self):
            self.set_font('Helvetica', 'B', 14)
            self.cell(0, 10, safe_title, 0, 1, 'C')
            self.ln(5)

        def footer(self):
            self.set_y(-15)
            self.set_font('Helvetica', 'I', 8)
            self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

    pdf = OCRReport()
    pdf.set_auto_page_break(auto=True, margin=15)

    for i, word in enumerate(words):
        # Four entries per page.
        if i % 4 == 0:
            pdf.add_page()

        # Word image
        img_bytes = word.get("image_data")
        if img_bytes:
            # fpdf reads images from a path, so stage the blob in a temp file.
            tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
            try:
                try:
                    if isinstance(img_bytes, str):
                        img_bytes = bytes(img_bytes, 'latin-1')
                    tmp.write(img_bytes)
                finally:
                    tmp.close()
                pdf.image(tmp.name, x=10, y=pdf.get_y(), w=40)
            except Exception as e:
                # A broken image must not abort the report, but leave a trace.
                logger.warning(f"تعذر إدراج صورة الكلمة في التقرير: {e}")
            finally:
                # Always remove the temp file, even when writing it failed.
                os.unlink(tmp.name)

        # Text
        pdf.set_xy(55, pdf.get_y())
        pdf.set_font('Helvetica', '', 10)
        text = latin1_safe(word.get("predicted_text") or "")
        pdf.multi_cell(0, 8, txt=f"Text: {text}")

        pdf.set_font('Helvetica', 'I', 8)
        conf = word.get("confidence", 0)
        try:
            conf_label = f"{float(conf):.2%}"
        except (TypeError, ValueError):
            # confidence stored as None or as a non-numeric value
            conf_label = "?"
        page_num = latin1_safe(word.get("page_num", "?"))
        status = latin1_safe(word.get("status", "?"))
        # Where multi_cell leaves the cursor differs between fpdf versions
        # (presumably the right margin in fpdf2 — confirm); pin x explicitly
        # so the line has room to render.
        pdf.set_x(55)
        pdf.cell(0, 5, f"Conf: {conf_label} | Page: {page_num} | Status: {status}", 0, 1)
        pdf.ln(5)

    # dirname is "" for a bare filename, and os.makedirs("") raises.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pdf.output(output_path)
    file_size = os.path.getsize(output_path)
    logger.info(f"تم تصدير PDF: {output_path} ({file_size} bytes, {len(words)} كلمة)")
    return output_path