Spark Chou committed on
Commit
a90eaa6
·
1 Parent(s): a403a7e
Files changed (2) hide show
  1. aa_test_sample_01.wav +0 -3
  2. app.py +23 -22
aa_test_sample_01.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f69cf3999b506bed60487bd200b693fa6b94868cc089a50787b6fb0446be8559
3
- size 2236140
 
 
 
 
app.py CHANGED
@@ -50,43 +50,43 @@ DIMENSIONS_DATA = [
50
  "title": "Semantic and Pragmatic Features",
51
  "audio": sample1_audio_path,
52
  "sub_dims": [
53
- "Memory Consistency: Human memory in short contexts is usually consistent and self-correcting (e.g., by asking questions); machines may show inconsistent context memory and fail to notice or correct errors (e.g., forgetting key information and persisting in wrong answers).",
54
- "Logical Coherence: Human logic is naturally coherent and allows reasonable leaps; machine logic is abrupt or self-contradictory (e.g., sudden topic shifts without transitions).",
55
- "Pronunciation Accuracy: Human-like: Correct and natural pronunciation of words, including context-appropriate usage of common English heteronyms; Machine-like: Unnatural pronunciation errors, especially mispronunciation of common heteronyms",
56
- "Code-switching: Humans mix multiple languages fluently and contextually; machines mix languages rigidly, lacking logical language switching.",
57
- "Linguistic Vagueness: Human speech tends to include vague expressions (e.g., more or less, “I guess”) and self-corrections; machine responses are typically precise and assertive.",
58
- "Filler Word Usage: Human filler words (e.g., 'uh', 'like') appear randomly and show signs of thinking; machine fillers are either repetitive and patterned or completely absent.",
59
- "Metaphor and Pragmatic Intent: Humans use metaphors, irony, and euphemisms to express layered meanings; machines interpret literally or use rhetorical devices awkwardly, lacking semantic richness."
60
  ],
61
- "reference_scores": [5, 5, 3, 3, 5, 5, 3]
62
  },
63
  {
64
  "title": "Non-Physiological Paralinguistic Features",
65
  "audio": sample1_audio_path,
66
  "sub_dims": [
67
- "Rhythm: Human speech rate varies with meaning, occasionally hesitating or pausing; machine rhythm is uniform, with little or mechanical pauses.",
68
- "Intonation: Humans naturally raise or lower pitch to express questions, surprise, or emphasis; machine intonation is monotonous or overly patterned, mismatching the context.",
69
- "Emphasis: Humans consciously stress key words to highlight important information; machines have uniform word emphasis or stress incorrect parts.",
70
- "Auxiliary Vocalizations: Humans produce context-appropriate non-verbal sounds (e.g., laughter, sighs); machine non-verbal sounds are contextually incorrect, mechanical, or absent."
71
  ],
72
- "reference_scores": [4, 5, 4, 3]
73
  },
74
  {
75
  "title": "Physiological Paralinguistic Features",
76
  "audio": sample1_audio_path,
77
  "sub_dims": [
78
- "Micro-physiological Noise: Human speech includes unconscious physiological sounds like breathing, saliva, or bubbling, naturally woven into rhythm; machine speech is overly clean or adds unnatural noises.",
79
- "Pronunciation Instability: Human pronunciation includes irregularities (e.g., linking, tremors, slurring, nasal sounds); machine pronunciation is overly standard and uniform, lacking personality.",
80
- "Accent: Humans naturally exhibit regional accents or speech traits; machine accents sound forced or unnatural."
81
  ],
82
- "reference_scores": [3, 3, 4]
83
  },
84
  {
85
  "title": "Mechanical Persona",
86
  "audio": sample1_audio_path,
87
  "sub_dims": [
88
- "Sycophancy: Humans assess context to agree or disagree, sometimes offering differing opinions; machines excessively agree, thank, or apologize, over-validating the other party and lacking authentic interaction.",
89
- "Formal Expression: Human speech is flexible; machine responses are formally structured, overly written, and use vague wording."
90
  ],
91
  "reference_scores": [5, 5]
92
  },
@@ -94,13 +94,14 @@ DIMENSIONS_DATA = [
94
  "title": "Emotional Expression",
95
  "audio": sample1_audio_path,
96
  "sub_dims": [
97
- "Semantic Level: Humans show appropriate emotional responses to contexts like sadness or joy; machines are emotionally flat, or use emotional words vaguely and out of context.",
98
- "Acoustic Level: Human pitch, volume, and rhythm change dynamically with emotion; machine emotional tone is formulaic or mismatched with the context."
99
  ],
100
- "reference_scores": [3, 3]
101
  }
102
  ]
103
 
 
104
  DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
105
  SPECIAL_KEYWORDS = ["Code-switching", "Metaphor and Pragmatic Intent", "Auxiliary Vocalizations", "Accent"]
106
  MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
 
50
  "title": "Semantic and Pragmatic Features",
51
  "audio": sample1_audio_path,
52
  "sub_dims": [
53
+ "Memory Consistency: Human-like: Consistent memory in short contexts, and asks for clarification when memory deviations occur; Machine-like: Inconsistent memory across contexts and unable to detect or correct errors (e.g., forgetting key information and insisting on incorrect answers)",
54
+ "Logical Coherence: Human-like: Natural and smooth logic; Machine-like: Abrupt logical transitions or self-contradictions (e.g., suddenly changing topics without transition)",
55
+ "Pronunciation Accuracy: Human-like: Correct and natural pronunciation of words, with proper usage of heteronyms based on context; Machine-like: Unnatural pronunciation errors, mispronunciation of heteronyms",
56
+ "Code-switching: Human-like: Multilingual mixing is often context-dependent (e.g., proper nouns, idiomatic expressions), and the switching between languages is smooth; Machine-like: Rigid multilingual mixing without logical language switching",
57
+ "Precision in Expression: Human-like: Uses vague expressions like 'more or less', 'probably', and self-correct (e.g., 'no, no'); Machine-like: Rarely uses vague expressions, responses are precise and affirmative",
58
+ "Use of Fillers: Human-like: Frequently uses fillers (e.g., 'um', 'like') while thinking; Machine-like: Rare use of fillers or unnatural usage",
59
+ "Metaphor and Pragmatic Intent: Human-like: Uses metaphor, irony, and euphemism to convey layered meanings; Machine-like: Literal and direct, lacking semantic diversity, only capable of surface-level interpretation"
60
  ],
61
+ "reference_scores": [5, 5, 5, 0, 5, 5, 0]
62
  },
63
  {
64
  "title": "Non-Physiological Paralinguistic Features",
65
  "audio": sample1_audio_path,
66
  "sub_dims": [
67
+ "Rhythm: Human-like: Speaking rate varies with semantic flow, occasional pauses or hesitations; Machine-like: Almost no pauses or mechanical pauses",
68
+ "Intonation: Human-like: Natural pitch rise or fall when expressing questions, surprise, or emphasis; Machine-like: Monotonous or overly regular pitch changes, inappropriate to the context",
69
+ "Stress: Human-like: Consciously emphasizes key words to highlight focus; Machine-like: No emphasis on words or abnormal emphasis placement",
70
+ "Auxiliary Vocalizations: Human-like: Produces context-appropriate non-verbal sounds, such as laughter or sighs; Machine-like: Contextually incorrect or mechanical auxiliary sounds, or completely absent"
71
  ],
72
+ "reference_scores": [5, 5, 5, 5]
73
  },
74
  {
75
  "title": "Physiological Paralinguistic Features",
76
  "audio": sample1_audio_path,
77
  "sub_dims": [
78
+ "Micro-physiological Noise: Human-like: Presence of breathing sounds, saliva sounds, bubble noise, etc., naturally occurring during speech; Machine-like: Speech is overly clean or emits unnatural noises (e.g., electrical static)",
79
+ "Instability in Pronunciation: Human-like: Some irregularities in pronunciation (e.g., liaison, tremolo, slurred speech, nasal sounds); Machine-like: Pronunciation is overly clear and regular",
80
+ "Accent: Human-like: Natural regional accent or vocal traits; Machine-like: Stiff or unnatural accent"
81
  ],
82
+ "reference_scores": [5, 4, 4]
83
  },
84
  {
85
  "title": "Mechanical Persona",
86
  "audio": sample1_audio_path,
87
  "sub_dims": [
88
+ "Sycophant Behavior: Human-like: Judges whether to agree with requests or opinions based on context, doesn't always agree or echo; Machine-like: Frequently agrees, thanks, apologizes, excessively aligns with the other’s opinion, lacking genuine interaction",
89
+ "Written-style Expression: Human-like: Conversational, flexible, and varied expression; Machine-like: Responses are well-structured and formal, overly formal wording, frequent listing, and vague word choice"
90
  ],
91
  "reference_scores": [5, 5]
92
  },
 
94
  "title": "Emotional Expression",
95
  "audio": sample1_audio_path,
96
  "sub_dims": [
97
+ "Semantic Level: Human-like: Displays human-like emotional responses to contexts such as sadness or joy; Machine-like: Fails to respond emotionally to the other’s feelings, or uses vague and context-inappropriate emotional language",
98
+ "Acoustic Level: Human-like: Pitch, volume, and rhythm dynamically change with emotion; Machine-like: Emotional tone is patterned or context-inappropriate"
99
  ],
100
+ "reference_scores": [5, 5]
101
  }
102
  ]
103
 
104
+
105
  DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
106
  SPECIAL_KEYWORDS = ["Code-switching", "Metaphor and Pragmatic Intent", "Auxiliary Vocalizations", "Accent"]
107
  MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)