davidtran999 committed on
Commit
9030829
·
verified ·
1 Parent(s): 1462240

Upload backend/scripts/etl_load.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. backend/scripts/etl_load.py +368 -0
backend/scripts/etl_load.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import argparse
import csv
import os
import sys
from datetime import datetime, date
from pathlib import Path
from typing import Dict, Optional

import django
from pydantic import BaseModel, ValidationError, field_validator


# This file lives at backend/scripts/etl_load.py, so parents[2] is the repo root.
ROOT_DIR = Path(__file__).resolve().parents[2]
BACKEND_DIR = ROOT_DIR / "backend"
HUE_PORTAL_DIR = BACKEND_DIR / "hue_portal"
# "tài nguyên" (Vietnamese for "resources") is the default CSV folder;
# override with the ETL_DATA_DIR environment variable.
DEFAULT_DATA_DIR = ROOT_DIR / "tài nguyên"
DATA_DIR = Path(os.environ.get("ETL_DATA_DIR", DEFAULT_DATA_DIR))
LOG_DIR = ROOT_DIR / "backend" / "logs" / "data_quality"

# Add backend directory to sys.path so Django can find hue_portal package
# Django needs to import hue_portal.hue_portal.settings, so backend/ must be in path
# IMPORTANT: Only add BACKEND_DIR, not HUE_PORTAL_DIR, because Django needs to find
# the hue_portal package (which is in backend/hue_portal), not the hue_portal directory itself
if str(BACKEND_DIR) not in sys.path:
    sys.path.insert(0, str(BACKEND_DIR))

# Add root for other imports if needed (but not HUE_PORTAL_DIR as it breaks Django imports)
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "hue_portal.hue_portal.settings")
django.setup()  # must run before importing any Django model below

# Model import is deliberately after django.setup(); noqa silences the E402 "import
# not at top of file" warning that ordering requires.
from hue_portal.core.models import Fine, Office, Procedure, Advisory  # noqa: E402


# Ensure the data-quality log directory exists before any loader writes to it.
LOG_DIR.mkdir(parents=True, exist_ok=True)
class OfficeRecord(BaseModel):
    """Validated row of danh_ba_diem_tiep_dan.csv (public-service office directory).

    Fix: in Pydantic v2 an ``Optional[X]`` annotation without a default is a
    *required* field — a row missing ``latitude``/``longitude``/``updated_at``
    would fail validation with "Field required". Explicit ``= None`` defaults
    make those fields genuinely optional (backward compatible for callers that
    always supply the keys).
    """

    unit_name: str
    address: Optional[str] = ""
    district: Optional[str] = ""
    working_hours: Optional[str] = ""
    phone: Optional[str] = ""
    email: Optional[str] = ""
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    service_scope: Optional[str] = ""
    updated_at: Optional[datetime] = None

    @field_validator("unit_name")
    @classmethod
    def unit_name_not_blank(cls, value: str) -> str:
        """Reject empty unit_name — it is the upsert key in load_offices."""
        if not value:
            raise ValueError("unit_name is required")
        return value
class FineRecord(BaseModel):
    """Validated row of muc_phat_theo_hanh_vi.csv (traffic-fine catalogue).

    Fix: in Pydantic v2 an ``Optional[X]`` annotation without a default is a
    *required* field — ``min_fine``/``max_fine``/``updated_at`` now default to
    ``None`` so rows lacking those keys validate instead of erroring.
    """

    violation_code: str
    violation_name: Optional[str] = ""
    article: Optional[str] = ""
    decree: Optional[str] = ""
    min_fine: Optional[float] = None
    max_fine: Optional[float] = None
    # Stored as free text (e.g. ranges), not an int — see CSV source.
    license_points: Optional[str] = ""
    remedial_measures: Optional[str] = ""
    source_url: Optional[str] = ""
    updated_at: Optional[datetime] = None

    @field_validator("violation_code")
    @classmethod
    def code_not_blank(cls, value: str) -> str:
        """Reject empty violation_code — it is the upsert key in load_fines."""
        if not value:
            raise ValueError("violation_code is required")
        return value
class ProcedureRecord(BaseModel):
    """Validated row of thu_tuc_hanh_chinh.csv (administrative procedures).

    Fix: in Pydantic v2 an ``Optional[X]`` annotation without a default is a
    *required* field — ``updated_at`` now defaults to ``None`` so rows lacking
    the key validate instead of erroring.
    """

    title: str
    domain: Optional[str] = ""
    level: Optional[str] = ""
    conditions: Optional[str] = ""
    dossier: Optional[str] = ""
    fee: Optional[str] = ""
    duration: Optional[str] = ""
    authority: Optional[str] = ""
    source_url: Optional[str] = ""
    updated_at: Optional[datetime] = None

    @field_validator("title")
    @classmethod
    def title_not_blank(cls, value: str) -> str:
        """Reject empty title — (title, domain) is the upsert key in load_procedures."""
        if not value:
            raise ValueError("title is required")
        return value
class AdvisoryRecord(BaseModel):
    """Validated row of canh_bao_lua_dao.csv (scam/fraud advisories).

    Fix: in Pydantic v2 an ``Optional[X]`` annotation without a default is a
    *required* field — ``published_at`` now defaults to ``None`` so rows
    lacking the key validate instead of erroring.
    """

    title: str
    summary: str
    source_url: Optional[str] = ""
    published_at: Optional[date] = None

    @field_validator("title")
    @classmethod
    def title_not_blank(cls, value: str) -> str:
        """Reject empty title — it is the upsert key in load_advisories."""
        if not value:
            raise ValueError("title is required")
        return value

    @field_validator("summary")
    @classmethod
    def summary_not_blank(cls, value: str) -> str:
        """Reject empty summary — advisories without body text are useless."""
        if not value:
            raise ValueError("summary is required")
        return value
def parse_datetime(value: Optional[str]) -> Optional[datetime]:
    """Best-effort parse of a timestamp string.

    Tries a fixed list of common CSV date formats first, then falls back to
    ``datetime.fromisoformat``. Returns None for empty or unparseable input.
    """
    if not value:
        return None
    known_formats = ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d", "%d/%m/%Y")
    for pattern in known_formats:
        try:
            return datetime.strptime(value, pattern)
        except ValueError:
            pass
    # Last resort: anything else ISO-8601-ish (e.g. with a "T" separator).
    try:
        return datetime.fromisoformat(value)
    except ValueError:
        return None
def parse_date(value: Optional[str]) -> Optional[date]:
    """Parse a date string into a ``datetime.date`` (for Advisory.published_at).

    Fix: the return annotation previously said ``Optional[datetime]`` although
    the function returns ``date`` objects.

    Accepts ISO (YYYY-MM-DD), slash-separated (YYYY/MM/DD) and day-first
    (DD/MM/YYYY) formats. Returns None for empty or unparseable input.
    """
    if not value:
        return None
    for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%d/%m/%Y"):
        try:
            return datetime.strptime(value, fmt).date()
        except ValueError:
            continue
    return None
def log_error(file_handle, dataset: str, row: Dict[str, str], error: str) -> None:
    """Append one structured error line to the data-quality log file."""
    stamp = datetime.utcnow().isoformat()
    line = f"[{stamp}Z] dataset={dataset} error={error} row={row}\n"
    file_handle.write(line)
def should_skip(updated_at: Optional[datetime], since: Optional[datetime]) -> bool:
    """Return True when incremental mode is active and the record predates it.

    Records without an updated_at value are never skipped.
    """
    return bool(since and updated_at and updated_at < since)
def load_offices(since: Optional[datetime], dry_run: bool, log_file) -> int:
    """Load contact-point offices from danh_ba_diem_tiep_dan.csv.

    Validates each row with OfficeRecord, logs invalid rows to ``log_file``,
    skips rows older than ``since`` (incremental mode) and upserts the rest
    keyed on unit_name. With ``dry_run`` no DB writes are made.

    Returns the number of valid, non-skipped rows.

    Fix: previously a ragged CSV row (more cells than header columns) made
    ``csv.DictReader`` store a *list* under the ``None`` key, crashing on
    ``.strip()``. Now uses the same defensive clean-up as load_procedures.
    """
    path = DATA_DIR / "danh_ba_diem_tiep_dan.csv"
    if not path.exists():
        log_error(log_file, "offices", {}, f"File không tồn tại: {path}")
        return 0

    processed = 0
    with path.open(encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        for row in reader:
            # Defensive clean-up: DictReader can yield a None key (extra
            # cells, value is a list) or None values (missing cells).
            clean_row = {}
            for k, v in row.items():
                key = str(k).strip() if k else ""
                clean_row[key] = v.strip() if isinstance(v, str) else ""
            # Empty coordinate strings must become None for float validation.
            for key in ("latitude", "longitude"):
                if clean_row.get(key) == "":
                    clean_row[key] = None
            clean_row["updated_at"] = parse_datetime(clean_row.get("updated_at"))
            try:
                record = OfficeRecord(**clean_row)
            except ValidationError as exc:
                log_error(log_file, "offices", clean_row, str(exc))
                continue

            if should_skip(record.updated_at, since):
                continue

            processed += 1
            if dry_run:
                continue

            # Upsert keyed on unit_name so re-runs stay idempotent.
            Office.objects.update_or_create(
                unit_name=record.unit_name,
                defaults={
                    "address": record.address or "",
                    "district": record.district or "",
                    "working_hours": record.working_hours or "",
                    "phone": record.phone or "",
                    "email": record.email or "",
                    "latitude": record.latitude,
                    "longitude": record.longitude,
                    "service_scope": record.service_scope or "",
                },
            )
    return processed
def load_fines(since: Optional[datetime], dry_run: bool, log_file) -> int:
    """Load traffic fines from muc_phat_theo_hanh_vi.csv.

    Validates each row with FineRecord, logs invalid rows to ``log_file``,
    skips rows older than ``since`` (incremental mode) and upserts the rest
    keyed on the violation code. With ``dry_run`` no DB writes are made.

    Returns the number of valid, non-skipped rows.

    Fix: previously a ragged CSV row (more cells than header columns) made
    ``csv.DictReader`` store a *list* under the ``None`` key, crashing on
    ``.strip()``. Now uses the same defensive clean-up as load_procedures.
    """
    path = DATA_DIR / "muc_phat_theo_hanh_vi.csv"
    if not path.exists():
        log_error(log_file, "fines", {}, f"File không tồn tại: {path}")
        return 0

    processed = 0
    with path.open(encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        for row in reader:
            # Defensive clean-up: DictReader can yield a None key (extra
            # cells, value is a list) or None values (missing cells).
            clean_row = {}
            for k, v in row.items():
                key = str(k).strip() if k else ""
                clean_row[key] = v.strip() if isinstance(v, str) else ""
            # Empty fine amounts must become None for float validation.
            for key in ("min_fine", "max_fine"):
                if clean_row.get(key) == "":
                    clean_row[key] = None
            clean_row["updated_at"] = parse_datetime(clean_row.get("updated_at"))
            try:
                record = FineRecord(**clean_row)
            except ValidationError as exc:
                log_error(log_file, "fines", clean_row, str(exc))
                continue

            if should_skip(record.updated_at, since):
                continue

            processed += 1
            if dry_run:
                continue

            # Upsert keyed on code so re-runs stay idempotent. Note the model
            # field names differ from the CSV/record names (code, name, remedial).
            Fine.objects.update_or_create(
                code=record.violation_code,
                defaults={
                    "name": record.violation_name or "",
                    "article": record.article or "",
                    "decree": record.decree or "",
                    "min_fine": record.min_fine,
                    "max_fine": record.max_fine,
                    "license_points": record.license_points or "",
                    "remedial": record.remedial_measures or "",
                    "source_url": record.source_url or "",
                },
            )
    return processed
def load_procedures(since: Optional[datetime], dry_run: bool, log_file) -> int:
    """Load administrative procedures from thu_tuc_hanh_chinh.csv.

    Validates each row with ProcedureRecord, logs invalid rows to
    ``log_file``, honours incremental ``since`` filtering and upserts the
    rest keyed on (title, domain). With ``dry_run`` no DB writes are made.

    Returns the number of valid, non-skipped rows.
    """
    path = DATA_DIR / "thu_tuc_hanh_chinh.csv"
    if not path.exists():
        log_error(log_file, "procedures", {}, f"File không tồn tại: {path}")
        return 0

    processed = 0
    with path.open(encoding="utf-8") as handle:
        for raw_row in csv.DictReader(handle):
            # Normalise the row: keys and values must end up as clean strings
            # (DictReader may produce None keys/values on ragged rows).
            clean_row = {}
            for raw_key, raw_value in raw_row.items():
                key = str(raw_key).strip() if raw_key else ""
                if not raw_value:
                    clean_row[key] = ""
                elif isinstance(raw_value, str):
                    clean_row[key] = raw_value.strip()
                else:
                    clean_row[key] = str(raw_value)
            clean_row["updated_at"] = parse_datetime(clean_row.get("updated_at"))

            try:
                record = ProcedureRecord(**clean_row)
            except ValidationError as exc:
                log_error(log_file, "procedures", clean_row, str(exc))
                continue
            if should_skip(record.updated_at, since):
                continue

            processed += 1
            if dry_run:
                continue

            # Upsert keyed on (title, domain) so re-runs stay idempotent.
            Procedure.objects.update_or_create(
                title=record.title,
                domain=record.domain or "",
                defaults={
                    "level": record.level or "",
                    "conditions": record.conditions or "",
                    "dossier": record.dossier or "",
                    "fee": record.fee or "",
                    "duration": record.duration or "",
                    "authority": record.authority or "",
                    "source_url": record.source_url or "",
                },
            )
    return processed
def load_advisories(since: Optional[datetime], dry_run: bool, log_file) -> int:
    """Load scam advisories from canh_bao_lua_dao.csv.

    Validates each row with AdvisoryRecord, logs invalid rows to
    ``log_file`` and upserts the rest keyed on title. With ``dry_run`` no
    DB writes are made.

    Returns the number of valid, non-skipped rows.
    """
    path = DATA_DIR / "canh_bao_lua_dao.csv"
    if not path.exists():
        log_error(log_file, "advisories", {}, f"File không tồn tại: {path}")
        return 0

    processed = 0
    with path.open(encoding="utf-8") as handle:
        for raw_row in csv.DictReader(handle):
            # Normalise the row: keys and values must end up as clean strings
            # (DictReader may produce None keys/values on ragged rows).
            clean_row = {}
            for raw_key, raw_value in raw_row.items():
                key = str(raw_key).strip() if raw_key else ""
                if not raw_value:
                    clean_row[key] = ""
                elif isinstance(raw_value, str):
                    clean_row[key] = raw_value.strip()
                else:
                    clean_row[key] = str(raw_value)
            clean_row["published_at"] = parse_date(clean_row.get("published_at"))

            try:
                record = AdvisoryRecord(**clean_row)
            except ValidationError as exc:
                log_error(log_file, "advisories", clean_row, str(exc))
                continue

            # Advisories have no updated_at; incremental mode filters on
            # published_at instead, when both values are available.
            if since and record.published_at and record.published_at < since.date():
                continue

            processed += 1
            if dry_run:
                continue

            # Upsert keyed on title so re-runs stay idempotent.
            Advisory.objects.update_or_create(
                title=record.title,
                defaults={
                    "summary": record.summary or "",
                    "source_url": record.source_url or "",
                    "published_at": record.published_at,
                },
            )
    return processed
def parse_args():
    """Build and evaluate the command-line interface of the ETL runner."""
    parser = argparse.ArgumentParser(description="ETL dữ liệu chatbot")
    parser.add_argument(
        "--since",
        help="Chỉ xử lý bản ghi có updated_at >= giá trị này (ISO date)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Chỉ kiểm tra dữ liệu, không ghi vào DB",
    )
    parser.add_argument(
        "--datasets",
        nargs="*",
        default=["offices", "fines"],
        choices=["offices", "fines", "procedures", "advisories"],
        help="Chọn dataset cần nạp",
    )
    return parser.parse_args()
def main():
    """Entry point: parse CLI arguments and run the selected dataset loaders."""
    args = parse_args()
    since = parse_datetime(args.since) if args.since else None
    log_path = LOG_DIR / f"etl_{datetime.utcnow().strftime('%Y%m%d%H%M%S')}.log"

    # Ordered dispatch table: (dataset flag, loader, printed label).
    loaders = (
        ("offices", load_offices, "Offices"),
        ("fines", load_fines, "Fines"),
        ("procedures", load_procedures, "Procedures"),
        ("advisories", load_advisories, "Advisories"),
    )
    with log_path.open("a", encoding="utf-8") as log_file:
        for dataset, loader, label in loaders:
            if dataset in args.datasets:
                total = loader(since, args.dry_run, log_file)
                print(f"{label} processed: {total}")

    print(f"Log ghi tại {log_path}")
# Allow running directly: python backend/scripts/etl_load.py [--dry-run ...]
if __name__ == "__main__":
    main()