File size: 923 Bytes
63fc25c
 
d6a1cb7
63fc25c
 
 
 
 
 
 
 
 
 
d6a1cb7
 
 
9d1441e
 
 
edf2758
d6a1cb7
 
9d1441e
d6a1cb7
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from datetime import datetime
import pandas as pd
import numpy as np

def now_utc_str() -> str:
    """Return the current UTC time as a compact ``YYYYMMDDTHHMMSSZ`` string.

    The timestamp has second resolution and always ends in a literal ``Z``.

    Returns:
        The formatted timestamp, e.g. ``"20240131T235959Z"``.
    """
    # Imported locally because the module header only imports ``datetime``.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated since Python 3.12 and yields a naive
    # value; an aware "now" in UTC formats to exactly the same string.
    return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

def text_clean(s: str) -> str:
    """Collapse all whitespace in *s* into single spaces and trim the ends.

    Ideographic spaces (U+3000) and newlines are turned into ordinary spaces
    before the text is split on whitespace and re-joined.

    Args:
        s: Text to clean. Any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned text with no leading, trailing, or repeated whitespace.
    """
    if not s:
        return ""
    # One translation pass replaces both characters with a plain space.
    to_plain_space = {0x3000: " ", 0x000A: " "}
    unified = s.translate(to_plain_space)
    tokens = unified.split()
    return " ".join(tokens)

def load_sample_df() -> pd.DataFrame:
    """Load the sample review data, falling back to a built-in sample.

    Tries to read ``data/sample_multilingual_reviews.csv`` (relative to the
    current working directory). If reading fails for any reason, a small
    DataFrame with a single ``text`` column of five hard-coded reviews is
    returned instead.

    Returns:
        The CSV contents, or the built-in five-row fallback frame.
    """
    fallback_texts = [
        "音質は良いがアプリが使いづらい",
        "Great battery life, app UX is confusing",
        "El micrófono capta demasiado viento en bici",
        "ノイズキャンセリングは強力だが風の音に弱い",
        "앱의 초기 튜토리얼が分かりづらい",
    ]
    try:
        frame = pd.read_csv("data/sample_multilingual_reviews.csv")
    except Exception:
        # Best-effort: any read failure silently yields the built-in sample.
        frame = pd.DataFrame({"text": fallback_texts})
    return frame

def normalize_rows(x: np.ndarray) -> np.ndarray:
    """Scale each row of *x* by its Euclidean norm.

    A tiny constant is added to every norm before dividing, so an all-zero
    row produces zeros rather than a division-by-zero result.

    Args:
        x: Array whose norms are taken along axis 1.

    Returns:
        An array of the same shape as *x* with each row divided by
        ``norm + 1e-12``.
    """
    eps = 1e-12
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    # keepdims=True keeps a trailing axis so the division broadcasts per row.
    scale = row_norms + eps
    return x / scale