""" HandwrittenOCR - تصدير بيانات التدريب ورفع إلى HuggingFace v4.0 ==================================================================== المحسنات: - auto_export(): CSV + XLSX + نص كامل + JSONL تدريب - create_backup(): نسخ احتياطي شامل - push_to_huggingface(): مع commit_message يحتوي التاريخ """ import os import json import random import shutil import tempfile import logging from datetime import datetime from pathlib import Path from src.logger import log_step import pandas as pd logger = logging.getLogger("HandwrittenOCR") def auto_export( db, run_id: str, output_dir: str = None, config=None, ) -> dict: """ تصدير تلقائي شامل: CSV + XLSX + النص الكامل + JSONL تدريب. Parameters: db: كائن قاعدة البيانات run_id: معرف التشغيل output_dir: مجلد الإخراج (اختياري) config: كائن الإعدادات (اختياري) Returns: ملخص التصدير {files, total_words, verified, ...} """ if output_dir is None: if config: output_dir = os.path.join(config.exports_dir, "auto", run_id) else: output_dir = os.path.join("exports", "auto", run_id) os.makedirs(output_dir, exist_ok=True) # جلب البيانات words = db.get_all() if not words: logger.warning("لا توجد بيانات للتصدير") return {} df_all = pd.DataFrame(words) df_verified = df_all[ df_all["status"].isin(["verified", "sentence_corrected"]) ] df_csv = df_all.drop(columns=["image_data"], errors="ignore") exported = {} # --- CSV --- csv_path = os.path.join(output_dir, "all_words.csv") df_csv.to_csv(csv_path, index=False, encoding="utf-8-sig") exported["csv"] = csv_path # --- XLSX (مع ورقة لكل صفحة) --- try: xlsx_path = os.path.join(output_dir, "all_words.xlsx") with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer: df_csv.to_excel(writer, sheet_name="All", index=False) for pg in sorted(df_csv["page_num"].dropna().unique()): page_df = df_csv[df_csv["page_num"] == pg] page_df.to_excel(writer, sheet_name=f"P{int(pg)}", index=False) exported["xlsx"] = xlsx_path except ImportError: logger.warning("openpyxl غير مثبت - تخطي XLSX") # --- النص الكامل المُعاد بناؤه --- try: from src.reconstruction import reconstruct_sentences_direct text_lines = reconstruct_sentences_direct(df_all) text_path = os.path.join(output_dir, "reconstructed_text.txt") with open(text_path, "w", encoding="utf-8") as f: f.write("\n".join(text_lines)) exported["text"] = text_path except Exception as e: logger.warning(f"فشل إعادة بناء النص: {e}") # --- JSONL للتدريب --- if not df_verified.empty: img_dir = os.path.join(output_dir, "training_images") os.makedirs(img_dir, exist_ok=True) records = [] for _, row in df_verified.iterrows(): fname = f"img_{row['image_id']}.png" with open(os.path.join(img_dir, fname), "wb") as f: f.write(row["image_data"]) txt = (row["predicted_text"] or "").strip() if txt: records.append({"image": fname, "text": txt}) jsonl_path = os.path.join(output_dir, "training_data.jsonl") with open(jsonl_path, "w", encoding="utf-8") as f: for rec in records: f.write(json.dumps(rec, ensure_ascii=False) + "\n") exported["jsonl"] = jsonl_path exported["training_samples"] = len(records) summary = { "run_id": run_id, "exported_at": datetime.now().isoformat(), "total_words": len(df_all), "verified": len(df_verified), "dir": output_dir, "files": exported, } summary_path = os.path.join(output_dir, "export_summary.json") with open(summary_path, "w", encoding="utf-8") as f: json.dump(summary, f, ensure_ascii=False, indent=2) logger.info(f"تم التصدير التلقائي: {output_dir}") return summary def export_finetuning_dataset( db, output_dir: str, val_ratio: float = 0.1, ) -> str | None: """ تصدير البيانات الموثقة كبيانات تدريب JSONL مع train/val split. Parameters: db: كائن قاعدة البيانات output_dir: مجلد الإخراج val_ratio: نسبة بيانات التحقق Returns: مسار مجلد الإخراج أو None """ verified = db.get_verified() verified = [ w for w in verified if w.get("status") in ("verified", "sentence_corrected") ] if not verified: logger.warning("لا توجد بيانات موثقة للتصدير") return None os.makedirs(output_dir, exist_ok=True) img_dir = os.path.join(output_dir, "images") os.makedirs(img_dir, exist_ok=True) jsonl_records = [] for row in verified: filename = f"img_{row['image_id']}.png" filepath = os.path.join(img_dir, filename) with open(filepath, "wb") as f: f.write(row["image_data"]) text = (row["predicted_text"] or "").strip() if text: jsonl_records.append({"image": filename, "text": text}) if not jsonl_records: return None random.shuffle(jsonl_records) split_idx = int(len(jsonl_records) * (1 - val_ratio)) train_data = jsonl_records[:split_idx] val_data = jsonl_records[split_idx:] def save_jsonl(data, fname): path = os.path.join(output_dir, fname) with open(path, "w", encoding="utf-8") as f: for rec in data: f.write(json.dumps(rec, ensure_ascii=False) + "\n") return path train_path = save_jsonl(train_data, "train.jsonl") val_path = save_jsonl(val_data, "val.jsonl") logger.info( f"تم التصدير: {len(jsonl_records)} عينة " f"(train={len(train_data)}, val={len(val_data)})" ) return output_dir def create_backup(config) -> str: """ إنشاء نسخة احتياطية شاملة. — بدون استخدام !cp (تصحيح #1) """ label = datetime.now().strftime("%Y%m%d_%H%M%S") bk_dir = os.path.join(config.backups_dir, f"backup_{label}") os.makedirs(bk_dir, exist_ok=True) files_to_backup = [ config.db_path, config.feedback_csv, config.stats_json, config.correction_dict_path, config.events_jsonl, ] for p in files_to_backup: if os.path.exists(p): shutil.copy2(p, os.path.join(bk_dir, os.path.basename(p))) # نسخ مجلد artifacts إذا وُجد artifacts = config.artifacts_dir if os.path.isdir(artifacts): dest = os.path.join(bk_dir, "artifacts") if not os.path.exists(dest): shutil.copytree(artifacts, dest) logger.info(f"تم إنشاء نسخة احتياطية: {bk_dir}") return bk_dir def push_to_huggingface( local_dataset_dir: str, hf_repo_id: str, hf_token: str = "", commit_message: str = "", ) -> bool: """ رفع البيانات الموثقة إلى HuggingFace Hub. مع commit_message يحتوي التاريخ. """ try: from huggingface_hub import HfApi, login except ImportError: logger.error("huggingface_hub غير مثبت") return False if not os.path.exists(local_dataset_dir): logger.error(f"المجلد غير موجود: {local_dataset_dir}") return False if hf_token: try: login(token=hf_token) except Exception as e: logger.error(f"فشل تسجيل الدخول: {e}") return False api = HfApi() try: api.create_repo( repo_id=hf_repo_id, repo_type="dataset", exist_ok=True ) except Exception: pass # commit_message مع التاريخ if not commit_message: commit_message = f"Update dataset - {datetime.now().strftime('%Y-%m-%d %H:%M')}" try: api.upload_folder( folder_path=local_dataset_dir, repo_id=hf_repo_id, repo_type="dataset", commit_message=commit_message, ) url = f"https://huggingface.co/datasets/{hf_repo_id}" logger.info(f"تم رفع البيانات إلى {url}") return True except Exception as e: logger.error(f"فشل الرفع: {e}") return False def export_pdf_report(db, output_path: str, title: str = "Handwriting OCR Report") -> str: """ تصدير تقرير PDF يحتوي على صور الكلمات والنصوص المصحّحة. يتطلب: pip install fpdf2 """ try: from fpdf import FPDF except ImportError: logger.warning("fpdf2 غير مثبت — تثبيت: pip install fpdf2") return "" log_step(logger, "export_pdf_report", {"output_path": output_path}) words = db.get_all() if not words: logger.info("لا توجد بيانات لتصدير PDF") return "" class OCRReport(FPDF): def header(self): self.set_font('Helvetica', 'B', 14) self.cell(0, 10, title, 0, 1, 'C') self.ln(5) def footer(self): self.set_y(-15) self.set_font('Helvetica', 'I', 8) self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C') pdf = OCRReport() pdf.set_auto_page_break(auto=True, margin=15) for i, word in enumerate(words): if i % 4 == 0: pdf.add_page() # صورة الكلمة if word.get("image_data"): img_bytes = word["image_data"] if isinstance(img_bytes, str): img_bytes = bytes(img_bytes, 'latin-1') tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False) tmp.write(img_bytes) tmp.close() try: pdf.image(tmp.name, x=10, y=pdf.get_y(), w=40) except Exception: pass os.unlink(tmp.name) # النص pdf.set_xy(55, pdf.get_y()) pdf.set_font('Helvetica', '', 10) text = str(word.get("predicted_text", "")) pdf.multi_cell(0, 8, txt=f"Text: {text}") pdf.set_font('Helvetica', 'I', 8) conf = word.get("confidence", 0) page_num = word.get("page_num", "?") status = word.get("status", "?") pdf.cell(0, 5, f"Conf: {conf:.2%} | Page: {page_num} | Status: {status}", 0, 1) pdf.ln(5) os.makedirs(os.path.dirname(output_path), exist_ok=True) pdf.output(output_path) file_size = os.path.getsize(output_path) logger.info(f"تم تصدير PDF: {output_path} ({file_size} bytes, {len(words)} كلمة)") return output_path