| from datetime import datetime | |
| import pandas as pd | |
| import numpy as np | |
def now_utc_str() -> str:
    """Return the current UTC time as a compact timestamp string.

    The format is ``YYYYMMDDTHHMMSSZ`` (e.g. ``20240131T235959Z``).

    Returns:
        The formatted timestamp; the trailing ``Z`` marks UTC.
    """
    # Imported locally so this function does not depend on the module's
    # import line also exposing ``timezone``.
    from datetime import timezone

    # datetime.utcnow() is deprecated (Python 3.12+) and yields a naive
    # datetime; an aware "now" in UTC formats to the same string.
    return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
def text_clean(s: str) -> str:
    """Collapse all whitespace in *s* to single spaces and trim the ends.

    A falsy value (``None`` or ``""``) yields ``""``.

    Args:
        s: The raw text to tidy.

    Returns:
        The text with every whitespace run replaced by one ASCII space and
        no leading or trailing whitespace.
    """
    raw = s if s else ""
    # str.split() with no separator splits on any Unicode whitespace --
    # including the ideographic space U+3000 and newlines -- and drops
    # empty leading/trailing pieces, so no explicit strip/replace is needed.
    tokens = raw.split()
    return " ".join(tokens)
def load_sample_df() -> pd.DataFrame:
    """Load the sample review data, falling back to built-in rows.

    Tries to read ``data/sample_multilingual_reviews.csv`` (relative to the
    current working directory). If reading raises any ``Exception``, a
    five-row frame with a single ``text`` column is returned instead.

    Returns:
        The CSV contents, or the built-in sample frame on failure.
    """
    csv_path = "data/sample_multilingual_reviews.csv"
    fallback_reviews = [
        "音質は良いがアプリが使いづらい",
        "Great battery life, app UX is confusing",
        "El micrófono capta demasiado viento en bici",
        "ノイズキャンセリングは強力だが風の音に弱い",
        "앱의 초기 튜토리얼が分かりづらい",
    ]
    try:
        frame = pd.read_csv(csv_path)
    except Exception:
        # Deliberate best-effort: any read failure uses the sample rows.
        frame = pd.DataFrame({"text": fallback_reviews})
    return frame
def normalize_rows(x: np.ndarray) -> np.ndarray:
    """Scale each row of *x* by its Euclidean norm taken along axis 1.

    A small epsilon is added to every norm before dividing, so an all-zero
    row stays zero instead of producing NaN.

    Args:
        x: Array with at least two dimensions (the norm is taken over axis 1).

    Returns:
        A new array of the same shape as *x* with rows divided by
        ``norm + 1e-12``.
    """
    eps = 1e-12
    # keepdims=True keeps the norms as a column so the division broadcasts
    # across each row.
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / (row_norms + eps)