Spaces:

wt1711
/

lovebird25

Paused

App Files Files Community

Paul committed on Dec 1, 2025

Commit

1a6e95f

1 Parent(s): bbce197

update

Browse files

Files changed (2) hide show

build_reply_dataset.py +140 -0
finetune_model.py +52 -27

build_reply_dataset.py ADDED Viewed

	@@ -0,0 +1,140 @@

+"""
+Utility script to build a clean reply-generation dataset for Models 1 & 2.
+Goal:
+- Tạo ra dataset mà output luôn là **câu trả lời từ phía Nam**,
+  đã được chuẩn hoá theo prompt wingman (anh/em, 1 câu, ≤25 từ).
+Ý tưởng:
+- Đọc `new_data_selected.csv` gốc (user_text, partner_text, trigger_*, move_*)
+- Tính trigger/move chính (như khi fine-tune trigger detector)
+- Gom hội thoại: `Male: user_text ||| Female: partner_text`
+- Gọi wingman prompt-based (`ReplySuggestionService`) để sinh `male_reply`
+- Write a new CSV with columns: `conversation,user_text,partner_text,trigger,move,male_reply`
+Usage (local hoặc trong Spaces terminal):
+    python build_reply_dataset.py \
+        --data_path new_data_selected.csv \
+        --output_path reply_training_data.csv
+Lưu ý:
+- Script dùng Hugging Face Inference API, cần HF_TOKEN có quyền call model (như trong reply_service.py).
+"""
+import argparse
+import os
+from typing import List
+import pandas as pd
+from reply_service import ReplySuggestionService
+def _detect_columns(df: pd.DataFrame, prefix: str) -> List[str]:
+    """Return the names of all columns of ``df`` that start with ``prefix``.
+    Column order follows ``df.columns``.
+    Raises:
+        ValueError: if no column name starts with ``prefix``.
+    """
+    matching: List[str] = []
+    for name in df.columns:
+        if name.startswith(prefix):
+            matching.append(name)
+    if matching:
+        return matching
+    raise ValueError(f"No columns found with prefix '{prefix}'")
+def _get_active_label(row: pd.Series, cols: List[str], prefix: str) -> str:
+    """Return the label of the first column in ``cols`` whose value equals 1.
+    The label is the column name with its leading ``prefix`` removed.
+    Args:
+        row: One record; values are read with ``row.get(col, 0)``.
+        cols: Candidate column names, checked in order.
+        prefix: Prefix stripped from the winning column name.
+    Returns:
+        The stripped column name, or ``"neutral"`` when no column equals 1.
+    """
+    for col in cols:
+        try:
+            is_active = float(row.get(col, 0)) == 1.0
+        except (TypeError, ValueError):
+            # A non-numeric or None cell counts as "not set" instead of aborting the run.
+            is_active = False
+        if is_active:
+            # Strip only the leading prefix: str.replace would also delete any
+            # later occurrence of the prefix inside the label itself.
+            return col[len(prefix):] if col.startswith(prefix) else col
+    return "neutral"
+def _build_conversation(user_text: str, partner_text: str) -> str:
+    """Format one exchange as a single conversation string.
+    Both inputs are stripped; a falsy input is treated as empty.
+    Returns:
+        ``"Male: <user> ||| Female: <partner>"`` when the user text is non-empty,
+        ``"Female: <partner>"`` when only the partner text is non-empty,
+        and ``""`` when both are empty.
+    """
+    male_part = user_text.strip() if user_text else ""
+    female_part = partner_text.strip() if partner_text else ""
+    if male_part:
+        return f"Male: {male_part} ||| Female: {female_part}"
+    if female_part:
+        return f"Female: {female_part}"
+    return ""
+def _cell_to_text(value) -> str:
+    """Convert a CSV cell to text, mapping missing values (None/NaN) to ''."""
+    # NaN is truthy, so `str(value or "")` alone would yield the literal "nan"
+    # for an empty cell; check for missing values explicitly first.
+    if pd.isna(value):
+        return ""
+    return str(value or "")
+def main():
+    """Build the reply-generation CSV from the raw labelled dataset.
+    Steps:
+    - parse ``--data_path``, ``--output_path`` and ``--max_rows``;
+    - load the input CSV and locate its ``trigger_*`` / ``move_*`` columns;
+    - for each row with a non-empty conversation, call
+      ``ReplySuggestionService.suggest_reply`` to obtain ``male_reply``;
+    - write the collected rows to ``--output_path``.
+    Rows whose reply generation raises are logged and skipped.
+    Raises:
+        ValueError: if the input has no ``trigger_*`` or no ``move_*`` columns.
+    """
+    parser = argparse.ArgumentParser(description="Build reply-generation dataset from new_data_selected.csv")
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        default="new_data_selected.csv",
+        help="Đường dẫn đến CSV gốc (new_data_selected.csv)",
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        default="reply_training_data.csv",
+        help="Đường dẫn file CSV output chứa male_reply",
+    )
+    parser.add_argument(
+        "--max_rows",
+        type=int,
+        default=-1,
+        help="Giới hạn số dòng xử lý (debug). -1 = dùng toàn bộ",
+    )
+    args = parser.parse_args()
+    print(f"[BUILD_DATASET] Loading dataset from {args.data_path}")
+    df = pd.read_csv(args.data_path)
+    if args.max_rows > 0:
+        df = df.head(args.max_rows)
+        print(f"[BUILD_DATASET] Using first {len(df)} rows for reply synthesis")
+    trigger_cols = _detect_columns(df, "trigger_")
+    move_cols = _detect_columns(df, "move_")
+    # Initialise the prompt-based wingman service.
+    print("[BUILD_DATASET] Initializing ReplySuggestionService (Inference API)...")
+    reply_service = ReplySuggestionService()
+    # Fixed column order, also used so an empty result still gets a CSV header.
+    out_columns = ["conversation", "user_text", "partner_text", "trigger", "move", "male_reply"]
+    out_rows = []
+    # `position` counts rows by order so progress logging does not depend on
+    # the index labels being integers.
+    for position, (idx, row) in enumerate(df.iterrows(), start=1):
+        user_text = _cell_to_text(row.get("user_text", ""))
+        partner_text = _cell_to_text(row.get("partner_text", ""))
+        conversation = _build_conversation(user_text, partner_text)
+        if not conversation:
+            continue
+        trigger = _get_active_label(row, trigger_cols, "trigger_")
+        move = _get_active_label(row, move_cols, "move_")
+        try:
+            male_reply = reply_service.suggest_reply(
+                male=user_text,
+                female=partner_text,
+                tone=move,
+                intent=trigger,
+            )
+        except Exception as exc:
+            # Best effort: one failed call must not abort the whole build.
+            print(f"[BUILD_DATASET] Row {idx}: reply generation failed: {exc}")
+            continue
+        out_rows.append(
+            {
+                "conversation": conversation,
+                "user_text": user_text,
+                "partner_text": partner_text,
+                "trigger": trigger,
+                "move": move,
+                "male_reply": male_reply,
+            }
+        )
+        if position % 50 == 0:
+            print(f"[BUILD_DATASET] Processed {position} rows...")
+    out_df = pd.DataFrame(out_rows, columns=out_columns)
+    out_df.to_csv(args.output_path, index=False)
+    print(f"[BUILD_DATASET] Saved {len(out_df)} rows to {args.output_path}")
+# Run the dataset build only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

finetune_model.py CHANGED Viewed

@@ -66,30 +66,56 @@ def build_instruction(conversation: str, trigger: str, move: str, persona: str)
 def prepare_training_data(df, use_history=True, persona="default"):
-    """Prepare data for fine-tuning"""
     training_data = []
     conversation_history = []
     trigger_cols = [col for col in df.columns if col.startswith("trigger_")]
     move_cols = [col for col in df.columns if col.startswith("move_")]
-    for idx, row in df.iterrows():
-        user_text = str(row['user_text']) if pd.notna(row['user_text']) else ""
-        partner_text = str(row['partner_text']) if pd.notna(row['partner_text']) else ""
-        # Skip rows with invalid data
         if not partner_text or partner_text.strip() == "_":
             continue
-        # Get active triggers and moves
         active_triggers = get_active_labels(row, trigger_cols)
         active_moves = get_active_labels(row, move_cols)
-        # Format: Use only the first active trigger/move (highest priority)
         trigger = active_triggers[0] if active_triggers[0] != "none" else "neutral"
         move = active_moves[0] if active_moves[0] != "none" else "neutral"
-        # Build conversation context
         if use_history and conversation_history:
             history_str = "\n".join(conversation_history)
             if user_text and user_text.strip() != "_":
@@ -102,28 +128,27 @@ def prepare_training_data(df, use_history=True, persona="default"):
                 conversation = f"Male: {user_text} ||| Female: {partner_text}"
             else:
                 conversation = f"Female: {partner_text}"
         prompt = build_instruction(conversation, trigger, move, persona)
         response = partner_text.strip()
-        training_data.append({
-            "instruction": prompt,
-            "input": "",
-            "output": response
-        })
-        # Update conversation history
         if user_text and user_text.strip() != "_":
             conversation_history.append(f"Male: {user_text}")
         if partner_text and partner_text.strip() != "_":
             conversation_history.append(f"Female: {partner_text}")
-        # Limit history length
         max_history = 4
         if len(conversation_history) > max_history:
             conversation_history = conversation_history[-max_history:]
     return training_data

 def prepare_training_data(df, use_history=True, persona="default"):
+    """
+    Prepare data for fine-tuning.
+    Nếu dataset đã có cột `male_reply` (build bởi build_reply_dataset.py) thì dùng:
+        conversation, trigger, move, male_reply
+    Làm ground-truth chuẩn cho reply từ phía Nam.
+    Nếu không, fallback về logic cũ dựa trên user_text / partner_text (ít lý tưởng hơn).
+    """
     training_data = []
     conversation_history = []
+    has_clean_reply = {"conversation", "trigger", "move", "male_reply"}.issubset(set(df.columns))
+    if has_clean_reply:
+        for _, row in df.iterrows():
+            conversation = str(row.get("conversation", "") or "")
+            trigger = str(row.get("trigger", "") or "neutral")
+            move = str(row.get("move", "") or "neutral")
+            reply = str(row.get("male_reply", "") or "").strip()
+            if not conversation or not reply:
+                continue
+            prompt = build_instruction(conversation, trigger, move, persona)
+            training_data.append(
+                {
+                    "instruction": prompt,
+                    "input": "",
+                    "output": reply,
+                }
+            )
+        return training_data
+    # Fallback: dùng dữ liệu gốc (kém lý tưởng hơn)
     trigger_cols = [col for col in df.columns if col.startswith("trigger_")]
     move_cols = [col for col in df.columns if col.startswith("move_")]
+    for _, row in df.iterrows():
+        user_text = str(row["user_text"]) if pd.notna(row.get("user_text")) else ""
+        partner_text = str(row["partner_text"]) if pd.notna(row.get("partner_text")) else ""
         if not partner_text or partner_text.strip() == "_":
             continue
         active_triggers = get_active_labels(row, trigger_cols)
         active_moves = get_active_labels(row, move_cols)
         trigger = active_triggers[0] if active_triggers[0] != "none" else "neutral"
         move = active_moves[0] if active_moves[0] != "none" else "neutral"
         if use_history and conversation_history:
             history_str = "\n".join(conversation_history)
             if user_text and user_text.strip() != "_":
                 conversation = f"Male: {user_text} ||| Female: {partner_text}"
             else:
                 conversation = f"Female: {partner_text}"
         prompt = build_instruction(conversation, trigger, move, persona)
         response = partner_text.strip()
+        training_data.append(
+            {
+                "instruction": prompt,
+                "input": "",
+                "output": response,
+            }
+        )
         if user_text and user_text.strip() != "_":
             conversation_history.append(f"Male: {user_text}")
         if partner_text and partner_text.strip() != "_":
             conversation_history.append(f"Female: {partner_text}")
         max_history = 4
         if len(conversation_history) > max_history:
             conversation_history = conversation_history[-max_history:]
     return training_data