dominic-fischer committed on
Commit
aa54010
·
1 Parent(s): 2c49b4b

unmix translations, i.e. make sure that the ones pertaining to the same idiom remain together

Browse files
Files changed (1) hide show
  1. app.py +27 -19
app.py CHANGED
@@ -21,33 +21,35 @@ def process_text(text):
21
 
22
  # Create a list to store token analyses
23
  token_analyses = []
24
-
25
  for token in doc.tokens:
26
- # For each token, get its lemmas and analyses
27
  token_info = {
28
  "token": token.text,
29
- "lemmas": {},
30
- "translations": []
31
  }
32
-
33
- # Get lemmas for the document's idiom
34
  for lemma, analyses in token.lemmas.items():
 
35
  if lemma.text not in token_info["lemmas"]:
36
- token_info["lemmas"][lemma.text] = []
37
-
 
 
 
 
38
  for analysis in analyses:
39
- # Handle case when analysis.features is None
40
  try:
41
  analysis_str = str(analysis)
42
  except AttributeError:
43
  analysis_str = "-"
44
- token_info["lemmas"][lemma.text].append(analysis_str)
45
-
46
- # Add German translations
47
- if lemma.translation_de != "null":
48
- token_info["translations"].append(f"{lemma.text}: {lemma.translation_de}")
49
-
50
  token_analyses.append(token_info)
 
51
 
52
  # Create DataFrame for token analysis
53
  df_tokens = pd.DataFrame([
@@ -55,16 +57,22 @@ def process_text(text):
55
  "Token": t["token"],
56
  "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in t["lemmas"].keys()]),
57
  "German translations": "<br>".join([
58
- f"<span style='font-style: italic; font-weight: bold; color: #0028A5;'>{translation.split(':')[1].strip()}</span>"
59
- for translation in sorted(t["translations"], key=len)[:10]
 
 
 
 
60
  ]),
61
  "Morphological Analysis": "<br>".join([
62
- f"{'<br>'.join(list(set(map(str, analyses))))}<br>"
63
- for lemma, analyses in t["lemmas"].items()
 
64
  ])
65
  }
66
  for t in token_analyses
67
  ])
 
68
 
69
  # Create bar chart data for idiom scores using plotly
70
 
 
21
 
22
  # Create a list to store token analyses
23
  token_analyses = []
24
+
25
  for token in doc.tokens:
 
26
  token_info = {
27
  "token": token.text,
28
+ "lemmas": {}
 
29
  }
30
+
 
31
  for lemma, analyses in token.lemmas.items():
32
+ # Initialize lemma entry
33
  if lemma.text not in token_info["lemmas"]:
34
+ token_info["lemmas"][lemma.text] = {
35
+ "analyses": [],
36
+ "translations": []
37
+ }
38
+
39
+ # Collect analyses
40
  for analysis in analyses:
 
41
  try:
42
  analysis_str = str(analysis)
43
  except AttributeError:
44
  analysis_str = "-"
45
+ token_info["lemmas"][lemma.text]["analyses"].append(analysis_str)
46
+
47
+ # Collect lemma-specific translation
48
+ if getattr(lemma, "translation_de", None) and lemma.translation_de != "null":
49
+ token_info["lemmas"][lemma.text]["translations"].append(lemma.translation_de)
50
+
51
  token_analyses.append(token_info)
52
+
53
 
54
  # Create DataFrame for token analysis
55
  df_tokens = pd.DataFrame([
 
57
  "Token": t["token"],
58
  "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in t["lemmas"].keys()]),
59
  "German translations": "<br>".join([
60
+ f"<b>{lemma}</b>: " +
61
+ "<br>".join([
62
+ f"<span style='font-style: italic; color: #0028A5;'>{tr}</span>"
63
+ for tr in lem_data["translations"]
64
+ ])
65
+ for lemma, lem_data in t["lemmas"].items()
66
  ]),
67
  "Morphological Analysis": "<br>".join([
68
+ f"<b>{lemma}</b>: " +
69
+ "<br>".join(sorted(set(lem_data["analyses"])))
70
+ for lemma, lem_data in t["lemmas"].items()
71
  ])
72
  }
73
  for t in token_analyses
74
  ])
75
+
76
 
77
  # Create bar chart data for idiom scores using plotly
78