Spaces:
Sleeping
Sleeping
Spark Chou committed on
Commit ·
a90eaa6
1
Parent(s): a403a7e
new
Browse files- aa_test_sample_01.wav +0 -3
- app.py +23 -22
aa_test_sample_01.wav
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f69cf3999b506bed60487bd200b693fa6b94868cc089a50787b6fb0446be8559
|
| 3 |
-
size 2236140
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -50,43 +50,43 @@ DIMENSIONS_DATA = [
|
|
| 50 |
"title": "Semantic and Pragmatic Features",
|
| 51 |
"audio": sample1_audio_path,
|
| 52 |
"sub_dims": [
|
| 53 |
-
"Memory Consistency: Human memory in short contexts
|
| 54 |
-
"Logical Coherence: Human
|
| 55 |
-
"Pronunciation Accuracy: Human-like: Correct and natural pronunciation of words,
|
| 56 |
-
"Code-switching:
|
| 57 |
-
"
|
| 58 |
-
"
|
| 59 |
-
"Metaphor and Pragmatic Intent:
|
| 60 |
],
|
| 61 |
-
"reference_scores": [5, 5,
|
| 62 |
},
|
| 63 |
{
|
| 64 |
"title": "Non-Physiological Paralinguistic Features",
|
| 65 |
"audio": sample1_audio_path,
|
| 66 |
"sub_dims": [
|
| 67 |
-
"Rhythm: Human
|
| 68 |
-
"Intonation:
|
| 69 |
-
"
|
| 70 |
-
"Auxiliary Vocalizations:
|
| 71 |
],
|
| 72 |
-
"reference_scores": [
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"title": "Physiological Paralinguistic Features",
|
| 76 |
"audio": sample1_audio_path,
|
| 77 |
"sub_dims": [
|
| 78 |
-
"Micro-physiological Noise: Human
|
| 79 |
-
"
|
| 80 |
-
"Accent:
|
| 81 |
],
|
| 82 |
-
"reference_scores": [
|
| 83 |
},
|
| 84 |
{
|
| 85 |
"title": "Mechanical Persona",
|
| 86 |
"audio": sample1_audio_path,
|
| 87 |
"sub_dims": [
|
| 88 |
-
"
|
| 89 |
-
"
|
| 90 |
],
|
| 91 |
"reference_scores": [5, 5]
|
| 92 |
},
|
|
@@ -94,13 +94,14 @@ DIMENSIONS_DATA = [
|
|
| 94 |
"title": "Emotional Expression",
|
| 95 |
"audio": sample1_audio_path,
|
| 96 |
"sub_dims": [
|
| 97 |
-
"Semantic Level:
|
| 98 |
-
"Acoustic Level: Human
|
| 99 |
],
|
| 100 |
-
"reference_scores": [
|
| 101 |
}
|
| 102 |
]
|
| 103 |
|
|
|
|
| 104 |
DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
|
| 105 |
SPECIAL_KEYWORDS = ["Code-switching", "Metaphor and Pragmatic Intent", "Auxiliary Vocalizations", "Accent"]
|
| 106 |
MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
|
|
|
|
| 50 |
"title": "Semantic and Pragmatic Features",
|
| 51 |
"audio": sample1_audio_path,
|
| 52 |
"sub_dims": [
|
| 53 |
+
"Memory Consistency: Human-like: Consistent memory in short contexts, and asks for clarification when memory deviations occur; Machine-like: Inconsistent memory across contexts and unable to detect or correct errors (e.g., forgetting key information and insisting on incorrect answers)",
|
| 54 |
+
"Logical Coherence: Human-like: Natural and smooth logic; Machine-like: Abrupt logical transitions or self-contradictions (e.g., suddenly changing topics without transition)",
|
| 55 |
+
"Pronunciation Accuracy: Human-like: Correct and natural pronunciation of words, with proper usage of heteronyms based on context; Machine-like: Unnatural pronunciation errors, mispronunciation of heteronyms",
|
| 56 |
+
"Code-switching: Human-like: Multilingual mixing is often context-dependent (e.g., proper nouns, idiomatic expressions), and the switching between languages is smooth; Machine-like: Rigid multilingual mixing without logical language switching",
|
| 57 |
+
"Precision in Expression: Human-like: Uses vague expressions like 'more or less', 'probably', and self-correct (e.g., 'no, no'); Machine-like: Rarely uses vague expressions, responses are precise and affirmative",
|
| 58 |
+
"Use of Fillers: Human-like: Frequently uses fillers (e.g., 'um', 'like') while thinking; Machine-like: Rare use of fillers or unnatural usage",
|
| 59 |
+
"Metaphor and Pragmatic Intent: Human-like: Uses metaphor, irony, and euphemism to convey layered meanings; Machine-like: Literal and direct, lacking semantic diversity, only capable of surface-level interpretation"
|
| 60 |
],
|
| 61 |
+
"reference_scores": [5, 5, 5, 0, 5, 5, 0]
|
| 62 |
},
|
| 63 |
{
|
| 64 |
"title": "Non-Physiological Paralinguistic Features",
|
| 65 |
"audio": sample1_audio_path,
|
| 66 |
"sub_dims": [
|
| 67 |
+
"Rhythm: Human-like: Speaking rate varies with semantic flow, occasional pauses or hesitations; Machine-like: Almost no pauses or mechanical pauses",
|
| 68 |
+
"Intonation: Human-like: Natural pitch rise or fall when expressing questions, surprise, or emphasis; Machine-like: Monotonous or overly regular pitch changes, inappropriate to the context",
|
| 69 |
+
"Stress: Human-like: Consciously emphasizes key words to highlight focus; Machine-like: No emphasis on words or abnormal emphasis placement",
|
| 70 |
+
"Auxiliary Vocalizations: Human-like: Produces context-appropriate non-verbal sounds, such as laughter or sighs; Machine-like: Contextually incorrect or mechanical auxiliary sounds, or completely absent"
|
| 71 |
],
|
| 72 |
+
"reference_scores": [5, 5, 5, 5]
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"title": "Physiological Paralinguistic Features",
|
| 76 |
"audio": sample1_audio_path,
|
| 77 |
"sub_dims": [
|
| 78 |
+
"Micro-physiological Noise: Human-like: Presence of breathing sounds, saliva sounds, bubble noise, etc., naturally occurring during speech; Machine-like: Speech is overly clean or emits unnatural noises (e.g., electrical static)",
|
| 79 |
+
"Instability in Pronunciation: Human-like: Some irregularities in pronunciation (e.g., liaison, tremolo, slurred speech, nasal sounds); Machine-like: Pronunciation is overly clear and regular",
|
| 80 |
+
"Accent: Human-like: Natural regional accent or vocal traits; Machine-like: Stiff or unnatural accent"
|
| 81 |
],
|
| 82 |
+
"reference_scores": [5, 4, 4]
|
| 83 |
},
|
| 84 |
{
|
| 85 |
"title": "Mechanical Persona",
|
| 86 |
"audio": sample1_audio_path,
|
| 87 |
"sub_dims": [
|
| 88 |
+
"Sycophant Behavior: Human-like: Judges whether to agree with requests or opinions based on context, doesn't always agree or echo; Machine-like: Frequently agrees, thanks, apologizes, excessively aligns with the other’s opinion, lacking genuine interaction",
|
| 89 |
+
"Written-style Expression: Human-like: Conversational, flexible, and varied expression; Machine-like: Responses are well-structured and formal, overly formal wording, frequent listing, and vague word choice"
|
| 90 |
],
|
| 91 |
"reference_scores": [5, 5]
|
| 92 |
},
|
|
|
|
| 94 |
"title": "Emotional Expression",
|
| 95 |
"audio": sample1_audio_path,
|
| 96 |
"sub_dims": [
|
| 97 |
+
"Semantic Level: Human-like: Displays human-like emotional responses to contexts such as sadness or joy; Machine-like: Fails to respond emotionally to the other’s feelings, or uses vague and context-inappropriate emotional language",
|
| 98 |
+
"Acoustic Level: Human-like: Pitch, volume, and rhythm dynamically change with emotion; Machine-like: Emotional tone is patterned or context-inappropriate"
|
| 99 |
],
|
| 100 |
+
"reference_scores": [5, 5]
|
| 101 |
}
|
| 102 |
]
|
| 103 |
|
| 104 |
+
|
| 105 |
DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
|
| 106 |
SPECIAL_KEYWORDS = ["Code-switching", "Metaphor and Pragmatic Intent", "Auxiliary Vocalizations", "Accent"]
|
| 107 |
MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
|