shahidul034 commited on
Commit
1db7196
·
verified ·
1 Parent(s): e0f16f8

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gradio/certificate.pem +31 -0
  2. assignment_sc_2/assignment_documentation.md +250 -0
  3. assignment_sc_2/code.py +320 -0
  4. assignment_sc_2/rubric_points_explanation.md +128 -0
  5. assignment_sc_2/rubric_points_explanation.pdf +137 -0
  6. code/attribution_eval.py +142 -0
  7. code/attribution_evalV2.py +222 -0
  8. code/combine_docid_labels.py +232 -0
  9. code/convert_awq.py +35 -0
  10. code/finetune-inference/convert_fp16.py +60 -0
  11. code/interface/annotators_v5.py +266 -0
  12. code/interface/annotators_v5_tran_quality.py +198 -0
  13. code/interface/instr +107 -0
  14. code/interface/instructions +43 -0
  15. code/interface/interface_correction_data.py +210 -0
  16. code/interface/t.py +8 -0
  17. code/interface/translate_gemma.py +78 -0
  18. code/interface/translation_quality.py +253 -0
  19. code/interface/translation_quality_v2.py +251 -0
  20. code/interface/vllm_app.py +46 -0
  21. code/interface/vllm_app_v2.py +115 -0
  22. code/key_subclaims_extract.py +109 -0
  23. code/literacy_thresholds.py +178 -0
  24. code/literacy_thresholds_v2.py +174 -0
  25. code/old/FH_es.py +86 -0
  26. code/old/FH_esV2.py +39 -0
  27. code/old/FH_fr.py +86 -0
  28. code/old/FH_pt.py +87 -0
  29. code/old/generate_thinking_data.ipynb +442 -0
  30. code/old/readability_controlv2.py +69 -0
  31. code/old/resonability_check_completeness_openai_V2.py +140 -0
  32. code/old/resonability_check_completeness_openai_V3.py +140 -0
  33. code/old/synthetic_data_generationV3.py +348 -0
  34. code/old/sz_es.py +68 -0
  35. code/rc.py +44 -0
  36. code/readability_final_res_process.ipynb +349 -0
  37. code/test.ipynb +64 -0
  38. code/text_classifier/dspy.ipynb +224 -0
  39. code/text_classifier/qwen3_(4b)_instruct.py +146 -0
  40. code/text_classifier/test_saved_dspy_vllm_gen_text_only.py +193 -0
  41. code/text_classifier/text_classifier_dspy.py +216 -0
  42. code/text_classifier/text_classifier_dspy_load_and_infer_full.py +353 -0
  43. code/text_classifier/text_classifier_dspy_only_gen_text.py +212 -0
  44. code/text_classifier/text_classifier_dspy_vllm.py +207 -0
  45. code/text_classifier/text_classifier_dspy_vllm_gen_text_only.py +203 -0
  46. code/text_classifier/text_classifier_dspy_vllm_test_cpp.py +115 -0
  47. code/translation_quality_check/calc_comet_bertscore_from_jsonl.py +274 -0
  48. code/translation_quality_check/eval_gpt52_translation.py +438 -0
  49. code/validation/data_gen_subclaims_support_valid_ch_gpt5.py +56 -0
  50. code/validation/subclaims_extr_valid_check_gpt5.py +56 -0
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
assignment_sc_2/assignment_documentation.md ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text-Attributed Network Analysis Documentation
2
+
3
+ This document explains how the implementation in `assignment_sc_2/code.py` addresses the assignment requirements and grading rubric.
4
+
5
+ ## 1. Objective
6
+
7
+ The assignment analyzes a network of research papers where:
8
+
9
+ - each node is a paper with metadata (`id`, `year`, `authors`, `title`, `abstract`),
10
+ - each edge represents semantic similarity between two papers,
11
+ - edge `weight` indicates tie strength (higher weight = stronger topical similarity).
12
+
13
+ The code loads `aclbib.graphml`, extracts the Largest Connected Component (LCC), and performs:
14
+
15
+ - weak/strong tie removal analysis,
16
+ - centrality analysis,
17
+ - centrality ranking correlation analysis,
18
+ - optional temporal topic-shift analysis.
19
+
20
+ ---
21
+
22
+ ## 2. Rubric Coverage Summary
23
+
24
+ ### (Part 2, 30%) Weak/Strong Ties and LCC Dynamics
25
+
26
+ Covered in `weaktie_analysis(LCC)`:
27
+
28
+ - ties are ordered by weight to represent weak-to-strong and strong-to-weak removal,
29
+ - two experiments are run:
30
+ - removing weakest ties first,
31
+ - removing strongest ties first,
32
+ - after each single edge removal, LCC size is recomputed,
33
+ - x-axis is fraction of ties removed,
34
+ - y-axis is LCC size (number of nodes).
35
+
36
+ Note: The implementation uses rank-based weak/strong definitions (by sorted weights). If explicit threshold-based counts are required by instructor policy, add a threshold rule (e.g., bottom/top quartile) and print those counts.
37
+
38
+ ### (Part 2, 35%) Centrality + Central Papers + Correlation + Interpretation
39
+
40
+ Covered in `centrality_analysis(LCC)`:
41
+
42
+ - computes degree, closeness, and betweenness centrality,
43
+ - identifies top 10 papers for each metric,
44
+ - outputs entries in `ID<TAB>Title` format,
45
+ - converts centrality scores to ranking vectors,
46
+ - computes Pearson correlation between metric rankings,
47
+ - prints a correlation table,
48
+ - identifies the lowest-correlation pair,
49
+ - provides interpretation grounded in metric definitions.
50
+
51
+ ### (Part 2, 10%) Report Quality
52
+
53
+ This markdown report provides:
54
+
55
+ - clear method descriptions,
56
+ - consistent structure by rubric item,
57
+ - direct mapping from requirements to implementation,
58
+ - interpretation guidance and limitations.
59
+
60
+ ### (Part 2, Optional Extra Credit, 50%) Research Evolution Analysis
61
+
62
+ Covered in `research_evolution_analysis(G)`:
63
+
64
+ - splits papers into before-2023 and after-2023 groups,
65
+ - tokenizes title + abstract,
66
+ - builds a shared global dictionary (vocabulary),
67
+ - trains LDA models for both groups using same vocabulary,
68
+ - obtains comparable topic-term matrices:
69
+ - `D` for pre-2023,
70
+ - `S` for post-2023,
71
+ - computes topic shift using cosine similarity,
72
+ - ranks potentially disappearing and emerging themes,
73
+ - prints top words for contextual interpretation.
74
+
75
+ ---
76
+
77
+ ## 3. Detailed Methodology
78
+
79
+ ## 3.1 Data Loading and LCC Extraction
80
+
81
+ 1. Load graph from `aclbib.graphml`.
82
+ 2. Extract the largest connected component:
83
+ - this ensures path-based metrics (closeness, betweenness) are meaningful and comparable.
84
+
85
+ ---
86
+
87
+ ## 3.2 Weak vs Strong Tie Analysis
88
+
89
+ ### Definitions
90
+
91
+ - Weak ties: lower edge weights (lower semantic similarity).
92
+ - Strong ties: higher edge weights (higher semantic similarity).
93
+
94
+ ### Procedure
95
+
96
+ 1. Sort edges by weight ascending (`weak -> strong`).
97
+ 2. Create reversed order (`strong -> weak`).
98
+ 3. For each removal order:
99
+ - remove one edge at a time,
100
+ - recompute LCC size after each removal,
101
+ - record:
102
+ - fraction removed = removed_edges / total_edges,
103
+ - LCC size = number of nodes in current largest connected component.
104
+ 4. Plot both removal curves.
105
+
106
+ ### What this shows
107
+
108
+ - If removing weak ties first rapidly fragments the network, weak ties are acting as bridges.
109
+ - If removing strong ties first causes larger impact, strong ties are most critical to global cohesion.
110
+
111
+ ---
112
+
113
+ ## 3.3 Centrality Analysis
114
+
115
+ ### Metrics
116
+
117
+ - Degree centrality: local connectivity prominence.
118
+ - Closeness centrality: global proximity to all nodes.
119
+ - Betweenness centrality: control over shortest-path flow.
120
+
121
+ ### Output
122
+
123
+ - Top 10 papers for each metric, as `ID<TAB>Title`.
124
+ - These lists identify influential papers under different notions of centrality.
125
+
126
+ ---
127
+
128
+ ## 3.4 Correlation Between Centrality Rankings
129
+
130
+ The assignment requests correlation between rankings, not raw centrality values.
131
+
132
+ ### Procedure
133
+
134
+ 1. Convert each metric score map into rank vector (rank 1 = highest centrality).
135
+ 2. Compute Pearson correlation for each pair:
136
+ - Degree vs Closeness,
137
+ - Degree vs Betweenness,
138
+ - Closeness vs Betweenness.
139
+ 3. Build and print correlation table.
140
+ 4. Find lowest-correlation pair and print interpretation.
141
+
142
+ ### Interpretation principle
143
+
144
+ Low correlation occurs when two metrics encode different structural roles, e.g.:
145
+
146
+ - local popularity (degree) vs bridge control (betweenness),
147
+ - global distance efficiency (closeness) vs brokerage roles (betweenness).
148
+
149
+ ---
150
+
151
+ ## 3.5 Optional Extra Credit: Research Evolution
152
+
153
+ ### Goal
154
+
155
+ Trace thematic shifts in research trends before and after 2023.
156
+
157
+ ### Procedure
158
+
159
+ 1. Split nodes by publication year:
160
+ - before 2023,
161
+ - 2023 and later.
162
+ 2. Build documents from title + abstract.
163
+ 3. Tokenize and clean text.
164
+ 4. Create one shared vocabulary dictionary for both groups.
165
+ 5. Train two LDA models (same vocabulary, separate corpora).
166
+ 6. Extract topic-term matrices:
167
+ - `D` (pre-2023),
168
+ - `S` (post-2023).
169
+ 7. Compute shift score for each topic:
170
+ - shift = `1 - max cosine similarity` to any topic in opposite period.
171
+ 8. Rank:
172
+ - pre-2023 topics with highest shift (potentially disappearing),
173
+ - post-2023 topics with highest shift (potentially emerging).
174
+ 9. Print top words for each ranked topic.
175
+
176
+ ### Why this is valid
177
+
178
+ - Shared vocabulary ensures `D` and `S` are directly comparable.
179
+ - Cosine similarity captures semantic overlap between topic distributions.
180
+ - Ranking by shift provides interpretable emergence/disappearance candidates.
181
+
182
+ ---
183
+
184
+ ## 4. Observed Results from Current Run
185
+
186
+ The following results were generated by running:
187
+
188
+ `python /home/mshahidul/readctrl/assignment_sc_2/code.py`
189
+
190
+ ### 4.1 Network and LCC Summary
191
+
192
+ - LCC contains `1662` nodes and `26134` edges.
193
+ - This indicates analysis is performed on a large connected core, suitable for centrality and connectivity experiments.
194
+
195
+ ### 4.2 Centrality Correlation Results
196
+
197
+ Pearson correlation between centrality rankings:
198
+
199
+ | Metric | Degree | Closeness | Betweenness |
200
+ |---|---:|---:|---:|
201
+ | Degree | 1.0000 | 0.9361 | 0.8114 |
202
+ | Closeness | 0.9361 | 1.0000 | 0.7684 |
203
+ | Betweenness | 0.8114 | 0.7684 | 1.0000 |
204
+
205
+ - Lowest-correlation pair: **Closeness vs Betweenness** (`r = 0.7684`).
206
+ - Interpretation: closeness captures global proximity, while betweenness captures shortest-path brokerage; these are related but not identical structural roles.
207
+
208
+ ### 4.3 Central Papers (Top-10) Highlights
209
+
210
+ Across Degree, Closeness, and Betweenness top-10 lists, several papers repeatedly appear, including:
211
+
212
+ - `ahuja-etal-2023-mega` (`{MEGA}: Multilingual Evaluation of Generative {AI}`),
213
+ - `ding-etal-2020-discriminatively`,
214
+ - `shin-etal-2020-autoprompt`,
215
+ - `weller-etal-2020-learning`,
216
+ - `qin-etal-2023-chatgpt`.
217
+
218
+ This overlap suggests robust influence of these papers across local connectivity, global accessibility, and bridge-like structural importance.
219
+
220
+ ### 4.4 Optional Topic Evolution Results
221
+
222
+ Topic matrices:
223
+
224
+ - `D` (before 2023): shape `(5, 5000)`
225
+ - `S` (after 2023): shape `(5, 5000)`
226
+
227
+ Top potentially disappearing theme example:
228
+
229
+ - Before Topic 4, shift `0.1912`, keywords:
230
+ `question, knowledge, event, performance, questions, task, graph, can`
231
+
232
+ Top potentially emerging theme example:
233
+
234
+ - After Topic 2, shift `0.1989`, keywords:
235
+ `llms, large, data, tasks, knowledge, reasoning, generation, performance`
236
+
237
+ Interpretation: post-2023 topics show stronger emphasis on **LLMs**, reasoning, and generation-centered trends.
238
+
239
+ ---
240
+
241
+ ## 5. Limitations and Practical Notes
242
+
243
+ - Weak/strong tie counts are currently implicit via sorted order; explicit threshold-based counts can be added if required.
244
+ - Topic modeling quality depends on preprocessing and corpus size.
245
+ - Interpretation quality in the final report should connect the output topics and central papers to real NLP/AI trends for stronger grading.
246
+
247
+ ---
248
+
249
+
250
+
assignment_sc_2/code.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Author: Md. Shahidul Salim
2
+ # Date: February 12, 2026
3
+
4
+ import networkx as nx
5
+ import pandas as pd
6
+ from scipy.stats import pearsonr
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+
10
+ # Extra credit imports
11
+ from gensim.models.ldamodel import LdaModel
12
+ from gensim.corpora.dictionary import Dictionary
13
+ import nltk
14
+ from nltk.tokenize import word_tokenize
15
+
16
+
17
# Ensure NLTK resources are available before tokenization is attempted.
# Some NLTK versions ship the sentence tokenizer tables as "punkt_tab".
for _resource, _lookup_path in (
    ("punkt", "tokenizers/punkt"),
    ("punkt_tab", "tokenizers/punkt_tab"),
):
    try:
        nltk.data.find(_lookup_path)
    except LookupError:
        nltk.download(_resource, quiet=True)
28
+
29
+
30
+ def _safe_int_year(value):
31
+ try:
32
+ return int(value)
33
+ except (TypeError, ValueError):
34
+ return 0
35
+
36
+
37
+ def _rank_vector(scores, node_order):
38
+ """
39
+ Convert centrality scores to rank vectors (1 = highest centrality),
40
+ which matches the assignment requirement to correlate rankings.
41
+ """
42
+ series = pd.Series({node: scores[node] for node in node_order})
43
+ ranks = series.rank(method="average", ascending=False)
44
+ return [float(ranks[node]) for node in node_order]
45
+
46
+
47
def _tokenize(text):
    """Lower-case and tokenize *text*, keeping alphabetic tokens longer than 2 chars."""
    kept = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and len(token) > 2:
            kept.append(token)
    return kept
50
+
51
+
52
# Part 1: Weak Tie Analysis
def weaktie_analysis(LCC):
    """
    Compare how the LCC fragments when ties are removed weakest-first
    versus strongest-first, and plot both removal curves.
    """
    print("\n--- Starting Weak/Strong Tie Analysis ---")

    def tie_weight(edge):
        # Missing weights are treated as 0.0 (weakest possible tie).
        return float(edge[2].get("weight", 0.0))

    weak_to_strong = sorted(LCC.edges(data=True), key=tie_weight)
    strong_to_weak = weak_to_strong[::-1]
    weights = [tie_weight(edge) for edge in weak_to_strong]
    n_ties = len(weights)

    if n_ties == 0:
        print("No ties found in the LCC; skipping weak/strong tie removal analysis.")
        return

    # Use median edge weight as the cutoff:
    # weak ties: weight <= median, strong ties: weight > median.
    median_weight = float(np.median(weights))
    n_weak = sum(1 for w in weights if w <= median_weight)
    n_strong = n_ties - n_weak

    print(f"Total ties in LCC: {n_ties}")
    print(f"Weak tie threshold (median weight): {median_weight:.4f}")
    print(f"Number of weak ties (weight <= {median_weight:.4f}): {n_weak}")
    print(f"Number of strong ties (weight > {median_weight:.4f}): {n_strong}")
    print("Methodology: remove one tie per step and recompute LCC size after each removal.")

    def removal_curve(ordered_edges):
        """Remove edges one by one, recording (fraction removed, LCC size) per step."""
        work_graph = LCC.copy()
        steps = len(ordered_edges)
        fractions = [0.0]
        sizes = [len(max(nx.connected_components(work_graph), key=len))]

        for step, (u, v, _) in enumerate(ordered_edges, start=1):
            if work_graph.has_edge(u, v):
                work_graph.remove_edge(u, v)

            if work_graph.number_of_nodes() > 0:
                sizes.append(len(max(nx.connected_components(work_graph), key=len)))
            else:
                sizes.append(0)
            fractions.append(step / steps)

        return fractions, sizes

    x_weak, y_weak = removal_curve(weak_to_strong)
    x_strong, y_strong = removal_curve(strong_to_weak)

    plt.figure(figsize=(10, 6))
    plt.plot(x_weak, y_weak, label="Removing Weakest First")
    plt.plot(x_strong, y_strong, label="Removing Strongest First")
    plt.xlabel("Fraction of Ties Removed")
    plt.ylabel("LCC Size (Number of Nodes)")
    plt.title("Impact of Weak vs Strong Tie Removal on LCC")
    plt.legend()
    plt.grid(True, linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()
110
+
111
+
112
# Part 2: Centrality Analysis
def centrality_analysis(LCC):
    """
    Compute degree, closeness, and betweenness centrality on the LCC.

    Reports the Pearson correlation between the ranking vectors of the three
    metrics, prints the top-10 papers per metric as ID<TAB>Title<TAB>Score,
    and lists papers that appear in more than one top-10 list.
    """
    print("\n--- Starting Centrality Analysis ---")

    degree = nx.degree_centrality(LCC)
    closeness = nx.closeness_centrality(LCC)
    betweenness = nx.betweenness_centrality(LCC)

    # The assignment asks for correlation between RANKINGS, not raw scores.
    nodes = list(LCC.nodes())
    d_rank = _rank_vector(degree, nodes)
    c_rank = _rank_vector(closeness, nodes)
    b_rank = _rank_vector(betweenness, nodes)

    corr_dc, _ = pearsonr(d_rank, c_rank)
    corr_db, _ = pearsonr(d_rank, b_rank)
    corr_cb, _ = pearsonr(c_rank, b_rank)

    print("\nTable 1: Pearson Correlation between Centrality Measure Rankings")
    table = pd.DataFrame(
        {
            "Metric": ["Degree", "Closeness", "Betweenness"],
            "Degree": [1.0, corr_dc, corr_db],
            "Closeness": [corr_dc, 1.0, corr_cb],
            "Betweenness": [corr_db, corr_cb, 1.0],
        }
    )
    print(table.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    pair_corr = {
        ("Degree", "Closeness"): corr_dc,
        ("Degree", "Betweenness"): corr_db,
        ("Closeness", "Betweenness"): corr_cb,
    }
    lowest_pair, lowest_value = min(pair_corr.items(), key=lambda x: x[1])
    highest_pair, highest_value = max(pair_corr.items(), key=lambda x: x[1])
    print(
        f"\nLowest-correlation pair: {lowest_pair[0]} vs {lowest_pair[1]} "
        f"(r = {lowest_value:.4f})"
    )
    print(
        f"Highest-correlation pair: {highest_pair[0]} vs {highest_pair[1]} "
        f"(r = {highest_value:.4f})"
    )

    # frozenset keys make the explanation lookup order-insensitive for a pair.
    explanations = {
        frozenset(("Degree", "Closeness")): (
            "Degree is local (immediate neighbors), while closeness captures "
            "global shortest-path proximity to all nodes."
        ),
        frozenset(("Degree", "Betweenness")): (
            "High degree does not always imply bridge-like behavior; betweenness "
            "emphasizes control over shortest paths across communities."
        ),
        frozenset(("Closeness", "Betweenness")): (
            "Closeness rewards overall proximity, while betweenness rewards "
            "being on critical routes between other nodes."
        ),
    }
    print(f"Interpretation: {explanations[frozenset(lowest_pair)]}")
    print(
        "Correlation quality note: values closer to 1 indicate stronger agreement "
        "between ranking-based notions of node importance."
    )

    metrics = {"Degree": degree, "Closeness": closeness, "Betweenness": betweenness}
    top_nodes_by_metric = {}
    for metric_name, score_map in metrics.items():
        print(f"\nTop 10 Papers for {metric_name} (ID<TAB>Title<TAB>Score):")
        top_10 = sorted(score_map.items(), key=lambda x: x[1], reverse=True)[:10]
        top_nodes_by_metric[metric_name] = [node_id for node_id, _ in top_10]
        for node_id, _ in top_10:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\t{score_map[node_id]:.6f}")

    # Identify papers that appear in multiple top-10 lists (robust centrality evidence).
    top_presence = {}
    for metric_name, node_ids in top_nodes_by_metric.items():
        for node_id in node_ids:
            # setdefault replaces the manual "if key not in dict: init" dance.
            top_presence.setdefault(node_id, []).append(metric_name)

    repeated = [
        (node_id, sorted(metric_names))
        for node_id, metric_names in top_presence.items()
        if len(metric_names) >= 2
    ]
    # Most-repeated papers first; ties broken alphabetically by ID for stable output.
    repeated.sort(key=lambda x: (-len(x[1]), x[0]))

    if repeated:
        print("\nPapers repeated across multiple centrality top-10 lists:")
        for node_id, metric_names in repeated:
            title = LCC.nodes[node_id].get("title", "Unknown Title")
            print(f"{node_id}\t{title}\tappears in: {', '.join(metric_names)}")
    else:
        print("\nNo paper appears in more than one top-10 centrality list.")
208
+
209
+
210
# Part 3: Research Evolution (Optional Extra Credit)
def research_evolution_analysis(G, num_topics=5):
    """
    Compare research themes before 2023 vs. 2023 onward.

    Trains one LDA model per period over a shared vocabulary, then ranks each
    period's topics by shift score = 1 - max cosine similarity to any topic of
    the other period (high shift = potentially disappearing/emerging theme).
    """
    print("\n--- Optional: Research Evolution Analysis ---")

    def tokenized_docs(node_ids):
        """Build one cleaned token list per paper from its title + abstract."""
        docs = []
        for n in node_ids:
            title = G.nodes[n].get("title", "")
            abstract = G.nodes[n].get("abstract", "")
            text = f"{title} {abstract}".strip()
            tokens = _tokenize(text) if text else []
            if tokens:
                docs.append(tokens)
        return docs

    # Papers with an unparseable year default to 0 and land in the "before" group.
    before_docs = tokenized_docs(
        n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) < 2023
    )
    after_docs = tokenized_docs(
        n for n, d in G.nodes(data=True) if _safe_int_year(d.get("year")) >= 2023
    )

    if not before_docs or not after_docs:
        print("Insufficient tokenized documents before/after 2023 for topic comparison.")
        return

    # A shared dictionary gives both topic-term matrices the same vocabulary axis,
    # which is what makes the cosine comparison between D and S valid.
    dictionary = Dictionary(before_docs + after_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=5000)
    if len(dictionary) == 0:
        print("Vocabulary became empty after filtering; skipping extra credit analysis.")
        return

    # Drop documents whose every token was filtered out of the vocabulary.
    before_corpus = [bow for bow in (dictionary.doc2bow(doc) for doc in before_docs) if bow]
    after_corpus = [bow for bow in (dictionary.doc2bow(doc) for doc in after_docs) if bow]

    if not before_corpus or not after_corpus:
        print("Insufficient BOW documents after vocabulary filtering.")
        return

    # Fixed random_state keeps topic assignments reproducible across runs.
    lda_before = LdaModel(
        corpus=before_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )
    lda_after = LdaModel(
        corpus=after_corpus, id2word=dictionary, num_topics=num_topics, passes=10, random_state=42
    )

    # D and S correspond to topic-term probability matrices with shared vocabulary.
    D = lda_before.get_topics()  # shape: (k1, n)
    S = lda_after.get_topics()  # shape: (k2, n)
    print(f"D matrix shape (before): {D.shape}")
    print(f"S matrix shape (after): {S.shape}")

    def cosine_similarity(a, b):
        """Cosine similarity, defined as 0.0 when either vector has zero norm."""
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def shift_scores(own, other):
        """Score each `own` topic by 1 - its best cosine match in `other`, sorted descending."""
        scores = []
        for i in range(own.shape[0]):
            sims = [cosine_similarity(own[i], other[j]) for j in range(other.shape[0])]
            scores.append((i, 1.0 - max(sims) if sims else 1.0))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

    before_shift = shift_scores(D, S)
    after_shift = shift_scores(S, D)

    def top_words(topic_vec, topn=8):
        """Return the topn highest-probability words of a topic, comma-joined."""
        idx = np.argsort(topic_vec)[::-1][:topn]
        return ", ".join(dictionary[i] for i in idx)

    print("\nPotentially disappearing themes (before topics with largest shift):")
    for topic_id, shift_score in before_shift:
        print(f"Before Topic {topic_id} | shift={shift_score:.4f} | {top_words(D[topic_id])}")

    print("\nPotentially emerging themes (after topics with largest shift):")
    for topic_id, shift_score in after_shift:
        print(f"After Topic {topic_id} | shift={shift_score:.4f} | {top_words(S[topic_id])}")
298
+
299
+
300
def main():
    """Load the paper-similarity graph, extract its LCC, and run all analyses."""
    try:
        G = nx.read_graphml("aclbib.graphml")
    except Exception as e:
        print(f"Error loading graph file: {e}")
        return

    if G.number_of_nodes() == 0:
        # max() over connected_components raises ValueError on an empty graph.
        print("Graph is empty; nothing to analyze.")
        return

    LCC_nodes = max(nx.connected_components(G), key=len)
    # copy() detaches the subgraph view so later edge removals cannot mutate G.
    LCC = G.subgraph(LCC_nodes).copy()
    print(
        f"Network loaded. LCC contains {len(LCC.nodes())} nodes and "
        f"{len(LCC.edges())} edges."
    )

    weaktie_analysis(LCC)
    centrality_analysis(LCC)
    research_evolution_analysis(G)


if __name__ == "__main__":
    main()
assignment_sc_2/rubric_points_explanation.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Part 2 Rubric Explanation
2
+ ## 1) Weak/strong ties and LCC change during removal
3
+
4
+ Tie strength is defined by edge `weight` in the LCC.
5
+
6
+ - Weak ties: `weight <= median`
7
+ - Strong ties: `weight > median`
8
+
9
+ From the run output:
10
+ I run two removal orders on the LCC:
11
+ 1. weakest to strongest
12
+ 2. strongest to weakest
13
+
14
+ After each edge removal, the LCC is recomputed and recorded (fraction removed vs. LCC size). This directly satisfies the rubric requirement to compare structural robustness under weak-first and strong-first deletions.
15
+ Edges are removed one by one. After every removal, the LCC is recalculated and its size is stored as node count. The x-axis is fraction of ties removed, and the y-axis is LCC size.
16
+ ## 2) Centrality, top papers, and correlation analysis
17
+ Centrality is computed on the LCC. From the run output, the starting LCC contains:
18
+ - `1662` nodes
19
+ - `26134` edges
21
+
22
+ The code also prints exact weak/strong tie statistics:
23
+
24
+ - total number of ties in the LCC: `26134`
25
+ - weak-tie threshold (median weight): `0.6276`
26
+ - number of weak ties (`weight <= 0.6276`): `13067`
27
+ - number of strong ties (`weight > 0.6276`): `13067`
28
+
29
+ So both tie classification and total weak/strong counts are explicitly reported before the stepwise removal process.
30
+
31
+ ## Centrality, central papers, interpretation, correlation
32
+
33
+ Three centrality measures are computed on the LCC:
34
+ - Degree
35
+ - Closeness
36
+ - Betweenness
37
+
38
+ For each metric, top-10 papers are printed in `ID<TAB>Title` format. Correlation between ranking vectors is:
39
+ For each one, top-10 papers are listed in `ID<TAB>Title` format.
40
+
41
+ For correlation, I first convert centrality scores to ranking vectors and then compute Pearson correlation between rankings.
42
+
43
+ Results from the run:
44
+ | Metric | Degree | Closeness | Betweenness |
45
+ |---|---:|---:|---:|
46
+ | Degree | 1.0000 | 0.9361 | 0.8114 |
47
+ | Closeness | 0.9361 | 1.0000 | 0.7684 |
48
+ | Betweenness | 0.8114 | 0.7684 | 1.0000 |
49
+
50
+ Lowest-correlation pair: **Closeness vs Betweenness (`0.7684`)**.
51
+ - Degree vs Closeness: `0.9361`
52
+ - Degree vs Betweenness: `0.8114`
53
+ - Closeness vs Betweenness: `0.7684` (lowest)
54
+ Interpretation: closeness captures global proximity, while betweenness captures bridge roles on shortest paths. A node can be globally near many others without being a major bridge, so these rankings diverge more than the other pairs.
55
+ The output explicitly reports the lowest-correlation pair.
56
+ Papers repeatedly appearing across top lists (e.g., `ahuja-etal-2023-mega`, `ding-etal-2020-discriminatively`, `qin-etal-2023-chatgpt`) indicate robust influence across multiple centrality notions.
57
+ Lowest pair interpretation:
58
+ ## 3) Optional extra credit: theme shift before vs after 2023
59
+ - closeness measures overall proximity in the graph
60
+ - betweenness measures bridge role on shortest paths
61
+ - these are related but different structural roles, so their rankings are less aligned
62
+ I split papers into two periods (before 2023, and 2023+), build text from title+abstract, use one shared vocabulary, train LDA for both periods, and compare topic vectors by cosine similarity.
63
+ Repeatedly central papers across top lists include:
64
+ Output evidence:
65
+ - `ahuja-etal-2023-mega`
66
+ - `ding-etal-2020-discriminatively`
67
+ - `shin-etal-2020-autoprompt`
68
+ - `weller-etal-2020-learning`
69
+ - `qin-etal-2023-chatgpt`
70
+
71
+ The code also explicitly prints papers that appear in multiple metric top-10 lists (with metric names), which strengthens the evidence for identifying robustly central papers.
72
+
73
+
74
+ ## Optional Extra Credit (50%): Theme shift before and after 2023
75
+
76
+ I compare two time groups: before 2023 and 2023+.
77
+
78
+ Steps used:
79
+
80
+ 1. split papers by year
81
+ 2. create text from title + abstract
82
+ 3. tokenize and clean
83
+ 4. build one shared vocabulary
84
+ 5. train LDA for each period
85
+ 6. extract topic-term matrices `D` (before) and `S` (after)
86
+ 7. compare topics with cosine similarity and rank by shift score
87
+
88
+ Run evidence:
89
+
90
+ - `D` shape: `(5, 5000)`
91
+ - `S` shape: `(5, 5000)`
92
+
93
+ Examples from output:
94
+
95
+ - emerging: `After Topic 2 | shift=0.1989 | llms, large, data, tasks, knowledge, reasoning, generation, performance`
96
+ - disappearing: `Before Topic 4 | shift=0.1912 | question, knowledge, event, performance, questions, task, graph, can`
97
+
98
+ This indicates a stronger LLM/reasoning focus in the later period.
99
+
100
+ ## Results (from current execution)
101
+
102
+ - Network loaded successfully; LCC size is `1662` nodes and `26134` edges.
103
+ - Weak/strong tie section reports:
104
+ - total ties: `26134`
105
+ - median-weight threshold: `0.6276`
106
+ - weak ties: `13067`
107
+ - strong ties: `13067`
108
+ - Centrality ranking correlations:
109
+ - Degree-Closeness: `0.9361`
110
+ - Degree-Betweenness: `0.8114`
111
+ - Closeness-Betweenness: `0.7684`
112
+ - Lowest-correlation pair: Closeness vs Betweenness.
113
+ - Top-10 central papers were produced for all three metrics in `ID<TAB>Title` format.
114
+ - Repeated papers across multiple centrality top-10 lists are explicitly reported.
115
+ - Topic-evolution matrices were produced:
116
+ - `D` (before 2023): `(5, 5000)`
117
+ - `S` (2023+): `(5, 5000)`
118
+ - Highest-shift emerging topic: After Topic 2 (`shift=0.1989`) with keywords around `llms`, `reasoning`, and `generation`.
119
+ - Highest-shift disappearing topic: Before Topic 4 (`shift=0.1912`) with keywords around `question`, `knowledge`, and `graph`.
120
+ - Topic matrices: `D` (before) = `(5, 5000)`, `S` (2023+) = `(5, 5000)`
121
+ ## Findings
122
+ Conclusion: post-2023 topics shift toward LLM- and reasoning-centered themes, while earlier topics are more question/knowledge/graph-oriented.
123
+ - The centrality rankings are strongly related overall, but not identical.
124
+ - Degree and closeness are most aligned (`0.9361`), indicating that papers with strong local connectivity are often globally well-positioned.
125
+ - Closeness and betweenness are least aligned (`0.7684`), showing that global proximity and bridge-role influence capture different node functions.
126
+ - Repeated appearance of papers such as `ahuja-etal-2023-mega`, `ding-etal-2020-discriminatively`, and `qin-etal-2023-chatgpt` across multiple lists suggests robust influence across different centrality definitions.
127
+ - Topic-shift outputs indicate post-2023 movement toward LLM-oriented and reasoning-heavy themes.
128
+ - Overall, the network remains highly connected at baseline, and the analysis pipeline covers connectivity, influence, and temporal theme evolution in a consistent way.
assignment_sc_2/rubric_points_explanation.pdf ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ %PDF-1.4
2
+ %���� ReportLab Generated PDF document (opensource)
3
+ 1 0 obj
4
+ <<
5
+ /F1 2 0 R /F2 3 0 R /F3 5 0 R
6
+ >>
7
+ endobj
8
+ 2 0 obj
9
+ <<
10
+ /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
11
+ >>
12
+ endobj
13
+ 3 0 obj
14
+ <<
15
+ /BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
16
+ >>
17
+ endobj
18
+ 4 0 obj
19
+ <<
20
+ /Contents 12 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 11 0 R /Resources <<
21
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
22
+ >> /Rotate 0 /Trans <<
23
+
24
+ >>
25
+ /Type /Page
26
+ >>
27
+ endobj
28
+ 5 0 obj
29
+ <<
30
+ /BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
31
+ >>
32
+ endobj
33
+ 6 0 obj
34
+ <<
35
+ /Contents 13 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 11 0 R /Resources <<
36
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
37
+ >> /Rotate 0 /Trans <<
38
+
39
+ >>
40
+ /Type /Page
41
+ >>
42
+ endobj
43
+ 7 0 obj
44
+ <<
45
+ /Contents 14 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 11 0 R /Resources <<
46
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
47
+ >> /Rotate 0 /Trans <<
48
+
49
+ >>
50
+ /Type /Page
51
+ >>
52
+ endobj
53
+ 8 0 obj
54
+ <<
55
+ /Contents 15 0 R /MediaBox [ 0 0 595.2756 841.8898 ] /Parent 11 0 R /Resources <<
56
+ /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
57
+ >> /Rotate 0 /Trans <<
58
+
59
+ >>
60
+ /Type /Page
61
+ >>
62
+ endobj
63
+ 9 0 obj
64
+ <<
65
+ /PageMode /UseNone /Pages 11 0 R /Type /Catalog
66
+ >>
67
+ endobj
68
+ 10 0 obj
69
+ <<
70
+ /Author (\(anonymous\)) /CreationDate (D:20260212152818-08'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20260212152818-08'00') /Producer (ReportLab PDF Library - \(opensource\))
71
+ /Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False
72
+ >>
73
+ endobj
74
+ 11 0 obj
75
+ <<
76
+ /Count 4 /Kids [ 4 0 R 6 0 R 7 0 R 8 0 R ] /Type /Pages
77
+ >>
78
+ endobj
79
+ 12 0 obj
80
+ <<
81
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1311
82
+ >>
83
+ stream
84
+ GauHL>ArL\'RoMS300IBA`?rd[6)fS[j-kh%spUrQbJhp^tU-[3mLRafC,t3BM.OAUacGn!ljDNhmn.n!?WE%]]LNUDfb^KR/dP$^&epA,FEt7]D:!,=(K'$$)U90A-C=?SWYPGn9=Z8Dn_P'/)5[q2T;;T\7#AJVDenb]q+m&PmRTfX'J8f_qeg.d#';*WXgf`(ZmqrK7Dn@PgLmA(d5Q_KP,NE)435?eN]!VHu,#]eJ]-7%FAhr?Z0kViSPO"Q!t@Cbr6bJ.DF3H_-*E.f\'e)#0ssTOVmph5^_s5%gTa#'rVQD?If91=euOT'C@_$"BMYT#'^6-5-PAp;O"t_CBe"_f,J9`pSN*68SS>=5:q+3F\]t"d.k8#P,?R\Gd9'q:Y;m>[-+FKbq$^JA(G).E(U=*%973N[)%gfK8Ho:%H.IeL1=(u&Dl_,#79btP["2:)%1OJ.*rt(XbATCMLZT<&>!rClsRq9]L(t%_f6]NOtA\rXugs6BI%@OU0-:)fnd!/B4GW8]FJo%hj]&XiTIAHJ([24!17'm?of(]+EJ&)5Ik9^ZQHZZ30#Qu_K]UcIKA,2J)6'3<.L''-6!M%f>aNDP(%u,5sKfspkUeNr`BAHEj>nm:$`f1^9JoCAo@bcfDl?iS-S8.6:9J%Er_G.e4Ps-Z8?Sm6EOU$\pWm@8JidW$M,ioG<#Q5<bYGEqkL),/9^m9'RoGfbr6tnIEG=]9[tmbZCD\U>BE49AJ)sK@q62''1U&\el6,#H;]-#2A^5,VD-&m*j=fQY/]WsdTjs1&?AQ>m9Q0JXgQj<@;W3_*5kH[X-[LA3e,]^8irp9>-t;I<Q@n1:s/&`dh1\<eR<([1/-S+">3DoB]Hb&9TgYTEHY.l@=/)$nQ7&&6GggB@M^H$@n.A>$99BbaQ1ghP9]*QMafr`H%j5X*jLH-qu"O0do#X)n_>Q,I>c:b/:7-d%;hUO5:%!ReuC;<fXgoS_Oh#`QiPlT#a,Rg9ZEgV&\chi4U>-!pGrbjf0o?TD?0&,C#XY#\cC;p/Cp_$*jdMNm6p?H`#Tuo1]o#qPLP/6huX#aD^Sq;%<4ePVpX-gC8lDdBlYK,pTW+U`;<cS6E#n.;L(QtL9Gi>/hOZ.Di4AD!Db5$(Pb)oC*Btj=g41<#@0-<=hWo+42.'OC+42S1_.rbfOPg*S\+XDXjSf%Pb\Ia]*[QSs%q/_:BM9s<[bRpanjb]>Y)sDa."aq=MdXWkPMe+'1Xh/H]+KUTo1h2g^D2.Pc0(jJir'MP8S"Fn`q!8K*C^DRR8%W;=ZeQ,:>u8iTbn?>l3+~>endstream
85
+ endobj
86
+ 13 0 obj
87
+ <<
88
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1420
89
+ >>
90
+ stream
91
+ GauHLgMZ%0&:Ml+bY%tL)gb$0'bgh?h$jp'Mf';ZoSY8[D/OJ(8];"_O7/19't$U_7R`gi&JJ*5F,"`r,Qih9RJED=!*;hgGg1cLYhT6M(HsCb_O'B;h&/;u%,F@>\-[d9$l1B#'70-6[FZDiP1]%+X>`SS8sl__'V3/4#:r+'WODsKZ1ER,]2r&)okMLXdok;2B:k&b@YI_Z_o1r#7/R]0nY[#16FB=b/?*l#Ds$-fDDEu`8HqGPj6KN)Tfc5(fl\kHdqf83KbpEIAkk.NZp>iY'aOu0$?P,?gjD"RB"rGKj_n5Vi7Gm(>SRO6>*Q!iN,oJm&PA`c&@)"!kA*gq/HcigicTAE2mD7-.f*e#O+Ns*.?PBN4bBJd.#Bp>[/,tf>I_^IeXCoOKA'a&Dj=%t&ADR&A6fY254Y5G]DXoG,&KFR"="uWVFKHeP\m*>Cd_QD%NsKjdK5OmL59m9$.0q'SLF3jhbV*kJ`jg2"J6;TNft;b%k*f'+3_;L;=VLU00`ciO;KIk;#),si#2FrT5c@mC!bH;)S[`GJg<B7qAqIt3:oWAB9@E^1P;&LhZ>Z8UD'CDHFV!W?kA$s%Ih_+2=NmkoH*E4pDb`W#9sC4(f]krC-G!(Dl`,XW4SBl\T2uJ8.4kP"Ou*(V`*2n9C$-ZPP6F`[.-%p:O2])T8DLVXJkkAe`tqAC<ZZ=LA`M=R1Mb"qp1F/:>IDbRXboC_)'LDO-J*f/Z:2aL=Ga\%pCE=%PB<dVXKS^G,N4VC-WXZ,^:#\7\DNp/=NK_7W<X*p@DUqk9X';B79-3R*&HFU7/Ge=qo>>qLOu:/<L:hS$s,HZQ&Xp(S-(q1mRQVcmsFbCVfToaebH%/;9nT;8@>uTlq1Y_[B5TG]Hj5ek%]mhW,ZUqDBR4c8Q[V,`"`9d:TDT4Z<k4(EdM,hinR?=kar'[*)!'D_'$`I,^3C9$\S.<1_8[leht);(=a7akn1W1/U[rq\sUnOVHMPdH5,)kT1\Z90_kd"[a5Io1?N/J#%.cCW5(O>]3s.2B=147OR+.:$%U<D"a*`H-TtJW+&b+]C3"L[%J5*J6jppT%rjN5VHt'D!>K^-jsM@$&Ek*M=%Pr:^j9;FMt=q1G+Ef'+eYPM;8V!baW7W<,HJqMiD#"Y[jPsZ>^fS/%6j/Zt4AT*MT4<5L=8,+>G.&PuA848/j%[=sQDe(5b'#0$E[UI^h]/MX"rbkA#5=W(K^IX7H_Oms*FLLDdK?XL@u&V3?XYcZ?0CF-d#UW;,LP?d+=Ph+I@AQ<S#l(6"JoB&@'&j0+I4,'iR:7dEDCoAtNW_'ado=%#/iITi:L!]dYVdLgl%@#`)1c+R#d+a")P3%Vj,,=)p2cf<sN@<uGO[WG;HHYW5deCq'C)`hsY^hSZ0Yo`gZiTb(Vk4T5/rWWu8:(@~>endstream
92
+ endobj
93
+ 14 0 obj
94
+ <<
95
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1480
96
+ >>
97
+ stream
98
+ GauI7?#T!f&;KZN/*<#_dPaie+RFm5O0d8,S0RDMdbaW99a:*M`&Y8)b%cG(NY<g_:V/>I77]!5j7+o]IV3I$*kb\S2ou)<VJu2I"6RLC"t[Cq@6/s8e$Et*Cbmj;0;I0\)%<:V-&\OcIRdalp"*r-7Ph988a5D`1SNDoBW(1P)_<jG)Un4:9`K_/&(\Z@oQn^)?t\Wh!PrZhHcpncFhV-eO1e`7cG7MJ_n]]R]$,hYP*`+(4K`+P>WEA%j+.&.+P%8U;902u_.Nj+-$(cWA-bC$J(-sX`3du^nSrY:)r85Z_s!%a@Nk_?i'duEL:hK@)'4PH[c._-i]L6c90j=!HGL>nn0A\H=Cka>*Ol<oE=H*^#+`Z-\0ta*G7?@-!-tjMg0d7E%TGr57FXs&8h7W2Bb5GY>@Z$T/S-jP(g2aW_P#$pk=1:22;H#8mh(Bl+3s35]Vc!^7Y;c-*MAJCP@5t`G<M(68ZCB.CKLnLJ4+-\F&q@R;H9*,9rrTEZkBAT@V.kQj9l350DPS$0o4kU!Q2Dlg=N>)HkIkD#s+<]6.E?+0KptD%rBl:aF,"_Aac2?QkIH(Q\.I14O8)jb^k6F-GM([I5`K2C^DA&8]i^s3p,4qB:bG-#tA3iAS'$,394C*C0OQuggcIfTYgGeO,[>M1Ij;=`I&*G<YStWjb.[8T11G^nS$oZ9HUSlf6ncER:@=:DKm63f1UFu/EA:ue39cO8'?L]lba\M<87a^oXE$4;6:ufc8k9-#M4.MhEsEGhCUd*SkFe"AP$`QmCf"#`baEH^2b=%hY^EJY+;id/ptOQoO6&trKi(]k;MX=EiA!ZU(<)RCDF:WGL35D6"fk^/qaF+U`uL]<s&\BDe1Z$gmUo#?bKtUFWa7e#D-)$jcrQSVAHdZk6ouh57!dGn(.r>ea^)e#Bc:L:!-Hd5uK.#5luGRUp=u[&[K2j&[K1kI^j^mgGhm?^S"_$[=h6EP^6\B-.GNN=d<RaT3u\d9+;)Jq@mKV76VD@[:,q%_iIn2Jk^+L&4:s>nNPU7EEF"VM0sU!A1jG&W2IMN7\:BtPfdZpdbmVCi/>1dfLWn99;]a(g`1j]'**_lJT=Ef0.p9RjpO3W//7B'bgJ5meG_.B_"1?PkBFmL;*Jm7`<%\23q/Y)A,OUjZeJ[O7-k&6_)a*d/X6X]1OJDm-SAW_kbt=3X.(4J`F(&i%+\Q9mVo<?05`SFWO:4/m]8C(QikqHKS+C_NXaSL'g9KE9/^8'5n_iW1kTr%N0dB>(N((L>i8@"+O'/_h'R/2NUk'F(5%bX222o^Gg`f$ma.+RH[dmI8E,%:LP:r;()QToBlms62nhU>G*GUr<C4Bf^"24H<nBQh*:5=[h.6P7="gZLF7X<rIFGk)Fn</XN35r"`kbO"#oj\nEmu83nJ+]Df:Hkc`"OFG*HLH2"eG1m*jeOV)P:V.;p0*_e*sol6pfa4Dd#=%mQ:P`rrT.HoW8~>endstream
99
+ endobj
100
+ 15 0 obj
101
+ <<
102
+ /Filter [ /ASCII85Decode /FlateDecode ] /Length 1219
103
+ >>
104
+ stream
105
+ GauI69lK&M&;KZL'm"J#b8'-B-pS=!DAPI3=D[;u0=3R!\=lHIV!Rs8G:;.M\naBbMZs36J/WJ'5/VZf0F6ou28>=fi55nNqS[nQYf_L@-8Sae_/HBdQTKAH>XXcE*5r>PMtr%FMjWY_?()NeP>(BH=I4o>8f4\`Ed81rYH[l>LOSLLGR2XVGe('4bcfT.&)6G7pLpBROT`2EZ?APb.(M,X5mob2#aNeOSRD<hY75k>.s(Co11u$'hlCNMSMVDZ_<<,F7iTZ/Bo9c@]"WZiJb^/F@2Q09#H3R;6!\T5]"^dpQhBC.FY.'GUFrtl/?GK))e&*l]!C3^?D)Fl/O\_$9/O1U3B2M'+5rR1Tn2>3'Bf%efuU+KrAg[<%rj\:^5&7u"[(goVbLoW)dTKYUb$@%0a<9cC.4%&#ImBe_F?O81u(#\qfOZEm!SU=A:$oI'A6>e?<,'@O7%e3?Wt+K1O*&E*UcWq#s)\YNom[r4,I[34k,Q4?5:BBp71&-!.-#BZ([I$2&NY(.9Yd\Ti/a<?f0s-Nj!A1^5W)9>Nt<qj5!NS,cI6l;Rek,Y+E=E.PHBr7]TeB-q`n]"hg`BO`Cl"g4@Z^6qUBH_W?;gc>UEZhU!hFbI>N-c12HfK@l^C6u#A4?s=&9[LNR=XpDkK6c)DW?*IgWN5;-QK\XJS<l#eU]L$,_SnU3:e-$*kXV<*WC^G=;/O#.@UJm7I]ck=@\soN).W'n./kh$#B(+[LF*=n9s*CD3N&TsDLoR4$ad^Ub?7m:2-d9$G]8gmSb!0;5(8X7/c+6B+lNB[Mm-k@IaVn/G8"E_aVJN6Q14&jHIuI<ZXB(2@C^4@>/Z(I9Sq_kPpQKuWGoBu/r$RZJqeg8qSpT-O4!$ps-O"HU?IW>-J<%gg^FFgJF+1K2msNrh0VEb;0a(V+n_U"qT=[3mK^(.T:j)>dWgO\ujXk946m\hP_E,Gc4@fF,d@6S>I]C[:r`#DO1Nf*`b_-U]M`&,S=MrCmTAu5mH$_X=g/U!r3>`(9bK4oGS7FlhH5.#%Vg$plnFZ?CTT,TV?Y):ih7G!^ODjmC*>C^D:O)3]oENM&-b70nooAJ@I6LssU^.YpL,RAR[cBbX.-Pe@7X6[eO2A_f>EhH_qqghK\DhOIKnW,ubpSrFX*-H!W,Kc;dpYA?J7YQk$S>79r=:1gE1`S"G.odp^)t.;)DA9Y(8gW/(BA[Z!.BF"hu~>endstream
106
+ endobj
107
+ xref
108
+ 0 16
109
+ 0000000000 65535 f
110
+ 0000000061 00000 n
111
+ 0000000112 00000 n
112
+ 0000000219 00000 n
113
+ 0000000331 00000 n
114
+ 0000000536 00000 n
115
+ 0000000641 00000 n
116
+ 0000000846 00000 n
117
+ 0000001051 00000 n
118
+ 0000001256 00000 n
119
+ 0000001325 00000 n
120
+ 0000001606 00000 n
121
+ 0000001684 00000 n
122
+ 0000003087 00000 n
123
+ 0000004599 00000 n
124
+ 0000006171 00000 n
125
+ trailer
126
+ <<
127
+ /ID
128
+ [<7b8e9ff53cd6975d4b04d04316e200e7><7b8e9ff53cd6975d4b04d04316e200e7>]
129
+ % ReportLab generated PDF document -- digest (opensource)
130
+
131
+ /Info 10 0 R
132
+ /Root 9 0 R
133
+ /Size 16
134
+ >>
135
+ startxref
136
+ 7482
137
+ %%EOF
code/attribution_eval.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def return_prompts_attribution(reference_full_text, generated_summary, subclaims_json, difficulty_level):
    """Build the GPT attribution-evaluation prompt for one generated summary.

    Parameters:
        reference_full_text: source document the summary was derived from.
        generated_summary: readability-controlled summary under evaluation.
        subclaims_json: stringified subclaims with their 0/1 support results.
        difficulty_level: "easy", "intermediate", or "hard".

    Returns:
        Prompt string instructing the model to judge each unsupported
        subclaim and emit a strict-JSON evaluation with a 0-5 score.
    """
    # NOTE: doubled braces ({{ }}) in the OUTPUT FORMAT section are literal
    # braces escaped for the f-string.
    # FIX: the unbalanced quotes in `"result": 0"` were corrected to
    # `"result": 0` so the instruction shows valid JSON.
    return f'''
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will assess whether **unsupported subclaims** in a generated summary (those with `"result": 0`) are *reasonable additions* based on the readability level (*easy / intermediate / hard*).

The goal is to determine whether these **extra pieces of information** are acceptable simplifications or *hallucinations* that reduce factual faithfulness.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Content Goal | Allowable Additions |
| :--------------- | :------------------------------- | :--------------------------------------------------------------------- | :--------------------------------------------------------------------------------- |
| **Easy** | General public | Simplify and clarify events | Allow general background info or lay explanations, but not new facts or diagnoses. |
| **Intermediate** | Educated layperson / med student | Add brief clarifications or causal context if consistent with the text | Allow inferred, non-contradictory context; avoid adding unconfirmed data. |
| **Hard** | Medical professional | Maintain factual precision | No additions; everything must be supported by source text. |

---

### **INPUT FIELDS**

**Reference full text:**
{reference_full_text}

**Generated summary ({difficulty_level}):**
{generated_summary}

**Subclaims and results:**
{subclaims_json}

---

### **TASK INSTRUCTIONS**

1. Focus only on subclaims with `"result": 0` (not supported by the input text).
2. For each unsupported subclaim:

   * Judge whether adding it is **reasonable** for the given readability level.
   * Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.
   * Provide a **1–2 sentence justification** explaining your reasoning.
3. After all evaluations, assign a **numerical attribution score (0–5)**:

   * **5** = All additions are reasonable or harmless simplifications.
   * **4** = Mostly reasonable; minor harmless additions.
   * **3** = Some misleading or unjustified additions.
   * **2** = Many factual inaccuracies.
   * **1** = Serious hallucinations; distorts source meaning.
   * **0** = Highly unfaithful; mostly invented content.
4. End with an **overall explanation (3–5 sentences)** summarizing your reasoning and suggestions.

---

### **OUTPUT FORMAT (strict JSON)**

```json
{{
  "evaluation_table": [
    {{
      "id": <subclaim_id>,
      "subclaim": "<text>",
      "evaluation": "<reasonable addition | unnecessary but harmless | misleading / hallucinated>",
      "explanation": "<short justification>"
    }}
  ],
  "attribution_score": <0-5>,
  "overall_explanation": "<concise summary of your judgment>"
}}
```
'''
72
from openai import OpenAI
import json
# Credentials file is expected to be a JSON object, e.g. {"openai": "<key>"}.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.load(file)

openai_api_key = api_keys.get("openai")

# Module-level client, reused by openai_return() below.
client = OpenAI(api_key=openai_api_key)
81
def openai_return(prompt):
    """Send *prompt* to gpt-5-mini and parse the reply as JSON.

    Strips any markdown code fences the model may wrap around its output
    before parsing. Raises json.JSONDecodeError if the reply is not JSON.
    """
    chat = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    raw = chat.choices[0].message.content
    cleaned = raw.strip().replace("```json", "").replace("```", "")
    return json.loads(cleaned)
91
+
92
+
93
import json
import os
import tqdm

# Synthetic summaries: one entry per document, three readability versions.
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"

with open(file_path, 'r') as f:
    synthetic_data = json.load(f)

# Subclaim-verifier output: one entry per (document id, readability version).
file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"

with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)

# Index the verifier results by (id, version).
# BUG FIX: the original used qwen3_32B_results[ind] for every readability
# version, reusing one entry's subclaims for all three levels. The results
# file carries 'id' and 'version' fields (see attribution_evalV2.py), so
# look entries up explicitly instead.
subclaims_by_key = {}
for item in qwen3_32B_results:
    subclaims_by_key[(item['id'], item['version'])] = item['attribution']['results']

res = []
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5_attribution.json"
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"Resuming from {len(res)} entries")

# (id, level) pairs already evaluated; defined unconditionally so a fresh
# run (no checkpoint file yet) cannot hit a NameError.
existing_check = set((entry['id'], entry['difficulty_level']) for entry in res)

# BUG FIX: iterate over every document. The original range(len(res), 100)
# conflated the number of saved (id, level) entries (up to 3 per document)
# with the document index, skipping unprocessed documents on resume.
# existing_check already guarantees finished pairs are not redone.
for ind in tqdm.tqdm(range(100)):
    for version in ["easy", "intermediate", "hard"]:
        if (synthetic_data[ind]['id'], version) in existing_check:
            print(f"Skipping {synthetic_data[ind]['id']}, {version}")
            continue
        ref_full_text_summary = f"{synthetic_data[ind]['full_text']}"
        generated_summary = f"{synthetic_data[ind]['readability_versions'][version]['text']}"
        subclaims_results = f"{subclaims_by_key[(synthetic_data[ind]['id'], version)]}"
        prompt = return_prompts_attribution(ref_full_text_summary, generated_summary, subclaims_results, version)
        try:
            ans = openai_return(prompt)
            res.append({
                "id": synthetic_data[ind]['id'],
                "difficulty_level": version,
                "response": ans
            })

            # Checkpoint frequently so an interruption loses little work.
            if len(res) % 2 == 0:
                print(f"Completed {len(res)} out of 300")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            # Best-effort: log and continue; the pair is retried on resume.
            print(f"Error at index {ind}, version {version}: {e}")

with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)
code/attribution_evalV2.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
# Select the GPU before torch/unsloth are imported so the CUDA context
# binds to the intended device.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

import json
import torch
from unsloth import FastLanguageModel
import tqdm


# Process-wide cache so the (large) model/tokenizer pair is loaded at most
# once per run; populated lazily by load_finetuned_model().
_model_cache = {"model": None, "tokenizer": None}
12
+
13
def load_finetuned_model(model_path: str):
    """Load and cache the fine-tuned model + tokenizer.

    The first call loads from *model_path*; later calls return the cached
    pair regardless of the path argument.
    """
    cached = _model_cache["model"]
    if cached is not None:
        return cached, _model_cache["tokenizer"]

    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=8192,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )
    _model_cache["model"] = loaded_model
    _model_cache["tokenizer"] = loaded_tokenizer
    return loaded_model, loaded_tokenizer
27
+
28
+
29
def build_inference_prompt(
    reference_full_text,
    generated_summary,
    subclaim_id,
    subclaim_text,
    subclaim_result,
    difficulty_level
):
    """
    Build a standardized inference prompt for single-subclaim evaluation.
    Use after fine-tuning to assess new examples consistently.

    Args:
        reference_full_text: source document text.
        generated_summary: readability-controlled summary containing the subclaim.
        subclaim_id: identifier of the subclaim being judged.
        subclaim_text: the subclaim sentence itself.
        subclaim_result: 1 if supported by the source, 0 otherwise.
        difficulty_level: "easy", "intermediate", or "hard".

    Returns:
        The fully formatted prompt string (leading/trailing whitespace stripped).
    """

    # FIX: the instruction lines previously showed `"result": 1"` and
    # `"result": 0"` (unbalanced trailing quote); corrected to valid JSON.
    # Doubled braces ({{ }}) are literal braces escaped for the f-string.
    inference_prompt = f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will analyze one subclaim from a generated medical summary.

Each subclaim includes a `"result"` flag:
- `1` → Supported by the reference text (no reasonableness check required)
- `0` → Unsupported by the reference text (evaluate scope and validity)

Your task is to decide, for unsupported subclaims, whether the new information
is a *reasonable addition* given the specified readability level:
**easy**, **intermediate**, or **hard**.

---

### **READABILITY GUIDELINES**

| Level | Audience | Style | Allowable Additions |
| :-- | :-- | :-- | :-- |
| **Easy (FH 70–100)** | General public | Simple, concrete | Broad clarifications only; no factual innovations |
| **Intermediate (FH 50–69)** | Educated nonspecialist | Moderate precision | Limited clarifications consistent with the text |
| **Hard (FH 0–49)** | Professionals | Formal, technical | Must be strictly supported by evidence |

---

### **INPUT**

Readability Level: {difficulty_level}

Reference Full Text:
{reference_full_text}

Generated Summary:
{generated_summary}

Subclaim Info:
{{
  "subclaim_id": {subclaim_id},
  "subclaim": "{subclaim_text}",
  "result": {subclaim_result}
}}

---

### **TASK INSTRUCTIONS**

- If `"result": 1`, respond with `"not_applicable"` and justify briefly
  (e.g., *"supported, no evaluation required"*).
- If `"result": 0`, classify reasonableness:
  - `"reasonable"` → legitimate simplification consistent with the readability level
  - `"partially_reasonable"` → benign rephrasing
  - `"unreasonable"` → misleading, speculative, or contradicted by the source

Provide a **short 1–2 sentence justification**.

---

### **EXPECTED OUTPUT (JSON ONLY)**

```json
{{
  "evaluation": {{
    "subclaim_id": {subclaim_id},
    "subclaim": "{subclaim_text}",
    "result": {subclaim_result},
    "reasonableness": "<reasonable | partially_reasonable | unreasonable | not_applicable>",
    "justification": "<brief justification>"
  }}
}}
```
""".strip()

    return inference_prompt
115
def infer_attribution_reasonableness(prompt: str, model_path: str):
    """Run inference using the fine-tuned model with an attribution prompt.

    Returns the parsed dict when the model emits valid JSON, otherwise the
    raw decoded string so the caller can inspect/repair it.
    """
    model, tokenizer = load_finetuned_model(model_path)

    messages = [{"role": "user", "content": prompt + "\n"}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=150,
            # Greedy decoding. FIX: temperature/top_p/top_k were previously
            # passed alongside do_sample=False; transformers ignores them in
            # greedy mode and only emits warnings, so they were removed.
            do_sample=False,
        )

    # FIX: decode only the newly generated tokens instead of the full
    # sequence -- previously the prompt was decoded too and recovered by
    # splitting on "</think>", which is fragile. The split is kept as a
    # safety net for models that still emit a thinking block.
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    output_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1].strip()
    # Strip markdown fences regardless of whether a think block was present
    # (the original only stripped them inside the think branch).
    output_text = output_text.replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(output_text)
    except Exception:
        parsed = output_text
    return parsed
149
+
150
+
151
# Input / output locations.
file_synth = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
file_qwen_results = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/attribution_resonability_results_100_qwen3-32B_v2.json"

with open(file_synth, 'r') as f:
    synthetic_data = json.load(f)
with open(file_qwen_results, 'r') as f:
    qwen3_32B_results = json.load(f)

# Verifier results indexed by (document id, readability level).
dict1 = {}
for item in qwen3_32B_results:
    version = item['version']
    dict1[(item['id'], version)] = item['attribution']['results']

res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"🔁 Resuming from {len(res)} entries")

existing = set((e["id"], e["difficulty_level"]) for e in res)

for ind in tqdm.tqdm(range(0, 100)):
    entry = synthetic_data[ind]

    for level in ["easy", "intermediate", "hard"]:
        # FIX: skip-check first so already-finished pairs never touch dict1
        # (the original did the lookup before deciding to skip).
        if (entry["id"], level) in existing:
            print(f"⏭️ Skipping {entry['id']} ({level})")
            continue
        subclaims_results = dict1[(entry["id"], level)]

        ref_full_text = entry["full_text"]
        generated_summary = entry["readability_versions"][level]["text"]
        temp = []
        for subclaim in subclaims_results:
            subclaim_id = subclaim['subclaim']['id']
            subclaim_text = subclaim['subclaim']['subclaim']
            subclaim_result = subclaim['result']

            # Supported subclaims need no reasonableness judgment.
            # BUG FIX: the original compared `subclaim_result == "1"` even
            # though the prompt treats `result` as numeric; comparing the
            # str() form handles both int 1 and string "1".
            if str(subclaim_result) == "1":
                temp.append({
                    "subclaim_id": subclaim_id,
                    "subclaim_text": subclaim_text,
                    "response": "not_applicable"
                })
                continue

            # Build the (long) prompt only for unsupported subclaims --
            # previously it was built before the support check and wasted.
            prompt = build_inference_prompt(
                ref_full_text,
                generated_summary,
                subclaim_id,
                subclaim_text,
                subclaim_result,
                level
            )
            response = infer_attribution_reasonableness(prompt, "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1")
            temp.append({
                "subclaim_id": subclaim_id,
                "subclaim_text": subclaim_text,
                "response": response
            })
        res.append({
            "id": entry["id"],
            "difficulty_level": level,
            "results": temp
        })
        # Periodic checkpoint so an interruption loses little work.
        if len(res) % 10 == 0:
            with open(save_path, 'w') as f:
                json.dump(res, f, indent=2, ensure_ascii=False)
            print(f"💾 Saved after {len(res)} entries")

with open(save_path, 'w') as f:
    json.dump(res, f, indent=2, ensure_ascii=False)
221
+
222
+
code/combine_docid_labels.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import json
5
+ from collections import defaultdict
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+
10
+
11
+ EXPECTED_LABELS = (
12
+ "low_health_literacy",
13
+ "intermediate_health_literacy",
14
+ "proficient_health_literacy",
15
+ )
16
+
17
+
18
@dataclass
class MergeStats:
    """Counters describing what happened during a combine_by_doc_id() merge."""

    total_rows: int = 0  # rows seen in the input list
    total_doc_ids: int = 0  # distinct doc_ids after grouping
    missing_label_rows: int = 0  # rows skipped because "label" was missing/empty
    unexpected_labels: int = 0  # rows whose label is not in EXPECTED_LABELS
    doc_ids_missing_some_labels: int = 0  # doc_ids lacking >= 1 expected label
    doc_ids_fulltext_mismatch: int = 0  # doc_ids whose rows disagree on fulltext
    doc_ids_summary_mismatch: int = 0  # doc_ids whose rows disagree on summary
    doc_ids_fulltext_subclaims_mismatch: int = 0  # conflicting fulltext subclaim lists
    doc_ids_summary_subclaims_mismatch: int = 0  # conflicting summary subclaim lists
29
+
30
+
31
+ def _pick_first_non_empty(values: List[Optional[str]]) -> Optional[str]:
32
+ for value in values:
33
+ if isinstance(value, str) and value.strip():
34
+ return value
35
+ for value in values:
36
+ if value is not None:
37
+ return value
38
+ return None
39
+
40
+
41
+ def _normalize_text(value: Any) -> Optional[str]:
42
+ if value is None:
43
+ return None
44
+ if not isinstance(value, str):
45
+ return str(value)
46
+ return value
47
+
48
+
49
+ def _normalize_string_list(value: Any) -> Optional[Tuple[str, ...]]:
50
+ if value is None:
51
+ return None
52
+ if not isinstance(value, list):
53
+ return (str(value),)
54
+ normalized: List[str] = []
55
+ for item in value:
56
+ if item is None:
57
+ continue
58
+ if isinstance(item, str):
59
+ normalized.append(item.strip())
60
+ else:
61
+ normalized.append(str(item).strip())
62
+ return tuple(normalized)
63
+
64
+
65
def combine_by_doc_id(rows: List[Dict[str, Any]], keep_all_fields_per_label: bool = True) -> Tuple[List[Dict[str, Any]], MergeStats]:
    """Merge per-label rows into one combined record per doc_id.

    Each input row represents one (doc_id, label) pair. Rows are grouped by
    doc_id; shared fields (fulltext, summary, subclaim lists) are hoisted to
    the top level of the combined record, while label-specific payloads are
    kept under a "labels" mapping.

    Args:
        rows: list of row dicts; non-dict entries and rows without a doc_id
            are silently skipped.
        keep_all_fields_per_label: if True, each label payload keeps every
            row field except the shared ones; if False, only the
            diff_label_texts / diff_label_subclaims fields are kept.

    Returns:
        (combined, stats) where combined is a list of per-doc_id dicts
        sorted by doc_id, and stats counts anomalies seen during the merge.
    """
    stats = MergeStats(total_rows=len(rows))

    # Group rows by integer doc_id, skipping malformed entries.
    grouped: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
    for row in rows:
        if not isinstance(row, dict):
            continue
        doc_id = row.get("doc_id")
        if doc_id is None:
            continue
        grouped[int(doc_id)].append(row)

    stats.total_doc_ids = len(grouped)

    combined: List[Dict[str, Any]] = []

    for doc_id in sorted(grouped.keys()):
        bucket = grouped[doc_id]

        labels_map: Dict[str, Dict[str, Any]] = {}
        # Candidate values for the shared top-level fields, one per row.
        fulltexts: List[Optional[str]] = []
        summaries: List[Optional[str]] = []
        fulltext_subclaims_sets: List[Optional[Tuple[str, ...]]] = []
        summary_subclaims_sets: List[Optional[Tuple[str, ...]]] = []

        for row in bucket:
            label = row.get("label")
            if not label:
                stats.missing_label_rows += 1
                continue
            if label not in EXPECTED_LABELS:
                # Counted but still merged -- unexpected labels are kept.
                stats.unexpected_labels += 1

            fulltexts.append(_normalize_text(row.get("fulltext")))
            summaries.append(_normalize_text(row.get("summary")))
            fulltext_subclaims_sets.append(_normalize_string_list(row.get("fulltext_subclaims")))
            summary_subclaims_sets.append(_normalize_string_list(row.get("summary_subclaims")))

            label_payload: Dict[str, Any]
            if keep_all_fields_per_label:
                # Shared within a doc_id; keep them only once at top-level
                label_payload = {
                    k: v
                    for k, v in row.items()
                    if k
                    not in (
                        "doc_id",
                        "label",
                        "fulltext",
                        "summary",
                        "fulltext_subclaims",
                        "summary_subclaims",
                    )
                }
            else:
                # Minimal mode: only the label-differentiated fields.
                label_payload = {
                    "diff_label_texts": row.get("diff_label_texts"),
                    "diff_label_subclaims": row.get("diff_label_subclaims"),
                }

            # Last row wins if the same label appears twice for a doc_id.
            labels_map[str(label)] = label_payload

        chosen_fulltext = _pick_first_non_empty(fulltexts)
        chosen_summary = _pick_first_non_empty(summaries)

        # First non-empty subclaim list wins for each shared field.
        chosen_fulltext_subclaims: Optional[List[str]] = None
        for items in fulltext_subclaims_sets:
            if items:
                chosen_fulltext_subclaims = list(items)
                break
        chosen_summary_subclaims: Optional[List[str]] = None
        for items in summary_subclaims_sets:
            if items:
                chosen_summary_subclaims = list(items)
                break

        # Detect rows within one doc_id that disagree on "shared" fields.
        distinct_fulltexts = {t.strip() for t in fulltexts if isinstance(t, str) and t.strip()}
        distinct_summaries = {t.strip() for t in summaries if isinstance(t, str) and t.strip()}
        if len(distinct_fulltexts) > 1:
            stats.doc_ids_fulltext_mismatch += 1
        if len(distinct_summaries) > 1:
            stats.doc_ids_summary_mismatch += 1

        distinct_fulltext_subclaims = {t for t in fulltext_subclaims_sets if t}
        distinct_summary_subclaims = {t for t in summary_subclaims_sets if t}
        if len(distinct_fulltext_subclaims) > 1:
            stats.doc_ids_fulltext_subclaims_mismatch += 1
        if len(distinct_summary_subclaims) > 1:
            stats.doc_ids_summary_subclaims_mismatch += 1

        missing_some = any(lbl not in labels_map for lbl in EXPECTED_LABELS)
        if missing_some:
            stats.doc_ids_missing_some_labels += 1

        combined.append(
            {
                "doc_id": doc_id,
                "fulltext": chosen_fulltext,
                "fulltext_subclaims": chosen_fulltext_subclaims,
                "summary": chosen_summary,
                "summary_subclaims": chosen_summary_subclaims,
                "labels": labels_map,
            }
        )

    return combined, stats
171
+
172
+
173
def main() -> None:
    """CLI entry point: read rows, merge them by doc_id, write the output JSON."""
    parser = argparse.ArgumentParser(
        description=(
            "Combine per-label rows into a single object per doc_id. "
            "Input is a JSON array with repeated doc_id for different labels."
        )
    )
    parser.add_argument(
        "--input",
        required=True,
        help="Path to input JSON file (list of rows)",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Path to output JSON file. Default: same folder with *_by_docid.json suffix",
    )
    parser.add_argument(
        "--minimal",
        action="store_true",
        help="Only keep diff_label_texts/diff_label_subclaims/fulltext_subclaims/summary_subclaims per label.",
    )

    args = parser.parse_args()
    input_path = Path(args.input)
    # Default output name: <input stem>_by_docid.json next to the input file.
    output_path = Path(args.output) if args.output else input_path.with_name(input_path.stem + "_by_docid.json")

    rows = json.loads(input_path.read_text(encoding="utf-8"))
    if not isinstance(rows, list):
        raise SystemExit("Input JSON must be a list")

    combined, stats = combine_by_doc_id(rows, keep_all_fields_per_label=not args.minimal)

    output_path.write_text(
        json.dumps(combined, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    print("Wrote:", str(output_path))
    # Surface the merge statistics so data problems (conflicting texts,
    # missing labels) are visible without opening the output file.
    print(
        "Stats:",
        json.dumps(
            {
                "total_rows": stats.total_rows,
                "total_doc_ids": stats.total_doc_ids,
                "missing_label_rows": stats.missing_label_rows,
                "unexpected_labels": stats.unexpected_labels,
                "doc_ids_missing_some_labels": stats.doc_ids_missing_some_labels,
                "doc_ids_fulltext_mismatch": stats.doc_ids_fulltext_mismatch,
                "doc_ids_summary_mismatch": stats.doc_ids_summary_mismatch,
                "doc_ids_fulltext_subclaims_mismatch": stats.doc_ids_fulltext_subclaims_mismatch,
                "doc_ids_summary_subclaims_mismatch": stats.doc_ids_summary_subclaims_mismatch,
            },
            indent=2,
        ),
    )
229
+
230
+
231
+ if __name__ == "__main__":
232
+ main()
code/convert_awq.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
# Set GPU environment variables
# (must happen before any CUDA-aware import so the device selection sticks)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Paths
# Source: merged BF16 checkpoint; destination: 4-bit AWQ-quantized copy.
model_path = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-support-check-8b_ctx_v2-bf16"
quant_path = "/home/mshahidul/readctrl_model/full_model/qwen3-32B-subclaims-support-check-8b_ctx_AWQ"

# Quantization configuration
# 4-bit weights, group size 128, GEMM kernel layout (standard AWQ recipe).
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM"
}

# Load model and tokenizer
print("Loading model...")
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize
print("Starting quantization (this may take a while)...")
# AutoAWQ uses a default calibration dataset (pile-val)
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
print(f"Saving quantized model to {quant_path}...")
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print("Quantization Complete!")
code/finetune-inference/convert_fp16.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import argparse

# Example:
# python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py \
#   --model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1
#   --save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged
#   --cuda_device 2
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, required=True,
                    help="Path to the fine-tuned model/adapter to convert.")
parser.add_argument("--save_path", type=str, required=True,
                    help="Path to save the converted BF16 model.")
parser.add_argument("--msl", type=int, default=8192,
                    help="Maximum sequence length for the model.")
parser.add_argument("--cuda_device", type=str, default="2",
                    help="CUDA device index to use.")
args = parser.parse_args()

# GPU visibility must be set BEFORE torch/unsloth are imported; once CUDA has
# enumerated devices the restriction would be ignored.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
import torch
from unsloth import FastLanguageModel


def convert_and_save():
    """Load the adapter in bf16 (not 4-bit) and write a merged 16-bit model."""
    print(f"Loading model from: {args.model_path}")

    # load_in_4bit must stay False so the merge yields clean 16-bit weights.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model_path,
        max_seq_length=args.msl,
        dtype=torch.bfloat16,
        load_in_4bit=False,
    )

    print(f"Saving merged BF16 model to: {args.save_path}")

    # 'merged_16bit' saves in whatever dtype the model was loaded with,
    # so this produces bfloat16 output here.
    model.save_pretrained_merged(
        args.save_path,
        tokenizer,
        save_method="merged_16bit",
    )

    print("Conversion complete. You can now use this path for vLLM or standard inference.")


if __name__ == "__main__":
    convert_and_save()
code/interface/annotators_v5.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import os
4
+ from datetime import datetime
5
+
6
+ # --- PATH CONFIGURATION ---
7
+ # DATA_PATH = "/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en_0_20.json"
8
+ DATA_PATH = "/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_bn_0_80.json"
9
+ SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data_Bangla_(0_80)"
10
+ os.makedirs(SAVE_ROOT, exist_ok=True)
11
+
12
+ # --- UI HTML COMPONENTS (Kept same as original) ---
13
+ GUIDE_HTML = """
14
+ <div style="background-color: #f9f9f9; padding: 15px; border-left: 6px solid #4CAF50; border-radius: 4px; margin-bottom: 20px;">
15
+ <h3>Rating Guide: Medical Text Difficulty</h3>
16
+ <table style="width:100%; border-collapse: collapse; text-align: left;">
17
+ <tr style="background-color: #e8f5e9;">
18
+ <th style="padding: 8px; border: 1px solid #ddd;">Score</th>
19
+ <th style="padding: 8px; border: 1px solid #ddd;">Description</th>
20
+ </tr>
21
+ <tr><td><b>1</b></td><td><b>Very Easy:</b> Simple words, no medical jargon.</td></tr>
22
+ <tr><td><b>2</b></td><td><b>Easy:</b> Conversational medical terms.</td></tr>
23
+ <tr><td><b>3</b></td><td><b>Moderate:</b> Standard patient education material.</td></tr>
24
+ <tr><td><b>4</b></td><td><b>Hard:</b> Significant technical jargon.</td></tr>
25
+ <tr><td><b>5</b></td><td><b>Very Hard:</b> Specialist-level / Academic.</td></tr>
26
+ </table>
27
+ </div>
28
+ """
29
+
30
+ EXAMPLES_HTML = """
31
+ <div style="background-color: #ffffff; padding: 15px; border: 1px solid #ddd; border-radius: 4px;">
32
+ <h3 style="color: #2e7d32;">Reference Examples</h3>
33
+ <div style="display: flex; gap: 15px;">
34
+ <div style="flex: 1; background-color: #f1f8e9; padding: 10px; border-radius: 4px;">
35
+ <h4>Level 1-2</h4>
36
+ <p>"She had a kidney problem... a big blood clot blocked veins in her brain."</p>
37
+ </div>
38
+ <div style="flex: 1; background-color: #ffebee; padding: 10px; border-radius: 4px;">
39
+ <h4>Level 4-5</h4>
40
+ <p>"Idiopathic NS inaugurated by cerebral venous thrombosis extended to the right jugular vein."</p>
41
+ </div>
42
+ </div>
43
+ </div>
44
+ """
45
import ast  # moved above the function that uses it (was imported after the def)


def parse_diff_label_texts(raw_value):
    """Coerce a ``diff_label_texts`` field into a dict.

    Accepts:
      - dict (already parsed): returned unchanged
      - JSON string: parsed with ``json.loads``
      - Python-dict-like string (single quotes): parsed with ``ast.literal_eval``

    Any other type, empty string, parse failure, or non-dict parse result
    yields ``{}``.
    """
    if isinstance(raw_value, dict):
        return raw_value

    if not isinstance(raw_value, str):
        return {}

    text = raw_value.strip()
    if not text:
        return {}

    # Prefer strict JSON first; fall back to Python literal parsing.
    try:
        parsed = json.loads(text)
        return parsed if isinstance(parsed, dict) else {}
    except json.JSONDecodeError:
        pass

    try:
        parsed = ast.literal_eval(text)
        return parsed if isinstance(parsed, dict) else {}
    except (ValueError, SyntaxError):
        return {}
75
+ # --- DATA LOADING ---
76
def normalize_dataset(raw_dataset):
    """Flatten dataset items into one entry per (document, difficulty label).

    Each raw item carries ``diff_label_texts``: either a ``{label: text}``
    dict or a string encoding of one (JSON or Python literal).  Returns a
    flat queue of dicts with keys: index, id, label, generated_summary.
    """
    normalized = []

    for item in raw_dataset:
        diff_label_texts = item.get("diff_label_texts")
        if not isinstance(diff_label_texts, dict):
            # String (or missing) payload: delegate parsing to the helper,
            # which already returns dicts unchanged and {} on failure.
            # (The original duplicated the whole append loop in an else branch.)
            diff_label_texts = parse_diff_label_texts(diff_label_texts)

        for label, text in diff_label_texts.items():
            normalized.append({
                "index": item.get("index"),
                "id": item.get("id"),
                "label": label,
                "generated_summary": text,
            })

    return normalized
110
+
111
+
112
if os.path.exists(DATA_PATH):
    with open(DATA_PATH, "r", encoding="utf-8") as f:
        RAW_DATASET = json.load(f)
    FULL_DATASET = normalize_dataset(RAW_DATASET)
    print(len(FULL_DATASET))
    # `assert` is stripped under `python -O`; raise explicitly so an empty
    # dataset can never slip through silently.
    if not FULL_DATASET:
        raise ValueError(f"No valid items found in dataset: {DATA_PATH}")
else:
    # Was `assert False, ...` — same issue, replaced with a real exception.
    raise FileNotFoundError(f"Data file not found at {DATA_PATH}")
120
+
121
+ # --- PERSISTENCE HELPERS ---
122
def get_user_dir(username):
    """Map a free-form username onto a sanitized per-user directory path."""
    kept = [c for c in username if c.isalnum() or c in (' ', '_', '-')]
    safe_name = "".join(kept).strip() or "anonymous"
    return os.path.join(SAVE_ROOT, safe_name)

def save_state(user_dir, state_dict):
    """Persist the full session state as pretty-printed JSON."""
    state_file = os.path.join(user_dir, "state.json")
    with open(state_file, "w") as fh:
        json.dump(state_dict, fh, indent=4)

def load_state(user_dir):
    """Load a previously saved session state, or None if none exists."""
    state_file = os.path.join(user_dir, "state.json")
    if not os.path.exists(state_file):
        return None
    with open(state_file, "r") as fh:
        return json.load(fh)
136
+
137
+ # --- LOGIC FUNCTIONS ---
138
def get_current_ui_values(state):
    """Return (text, progress label, rating) for the item at the cursor.

    If the user has already rated this queue position, surface that rating
    so revisiting an item shows the previous answer; otherwise default to 3.
    """
    idx = state['current_index']
    item = state['queue'][idx]

    rating = next(
        (res['rating'] for res in state['results'] if res['queue_position'] == idx),
        3,  # default when this item has not been rated yet
    )

    progress = f"Item {idx + 1} of {len(state['queue'])}"
    return item['generated_summary'], progress, rating
152
+
153
def start_session(username):
    """Create a fresh annotation session for ``username`` or resume a saved one."""
    if not username:
        gr.Warning("Please enter a username!")
        return [gr.update()] * 5

    user_dir = get_user_dir(username)
    os.makedirs(user_dir, exist_ok=True)

    state = load_state(user_dir)
    if state:
        gr.Info(f"Welcome back! Resuming from item {state['current_index'] + 1}.")
    else:
        # Fresh session: copy the dataset so per-user queues stay independent.
        state = {
            "username": username,
            "current_index": 0,
            "queue": list(FULL_DATASET),
            "results": [],
            "completed": False,
        }
        save_state(user_dir, state)

    text, progress, rating = get_current_ui_values(state)
    # Hide the login box, reveal the task box, and populate the first item.
    return (gr.update(visible=False), gr.update(visible=True), text, progress, rating, state)
177
+
178
def submit_rating(doc_slider, state):
    """Record the rating for the current item and advance the queue.

    Returns (text, progress, slider value, state) — exactly four values,
    matching the four Gradio outputs wired to this callback.
    """
    if state is None:
        # BUG FIX: previously returned FIVE values ("", "Error", 3, 3, None)
        # for four outputs, which makes Gradio raise on dispatch.
        return "", "Error", 3, None

    user_dir = get_user_dir(state['username'])
    idx = state['current_index']
    current_item = state['queue'][idx]

    new_result = {
        "queue_position": idx,
        "index": current_item.get('index', idx),
        "doc_id": current_item.get('id', current_item.get('index', 'no_id')),
        "label": current_item.get('label', 'no_label'),
        "rating": doc_slider,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Overwrite any previous rating for this position, then keep results ordered.
    state['results'] = [r for r in state['results'] if r['queue_position'] != idx]
    state['results'].append(new_result)
    state['results'].sort(key=lambda x: x['queue_position'])

    is_last = idx + 1 >= len(state['queue'])
    if is_last:
        state['completed'] = True
    else:
        state['current_index'] += 1
    save_state(user_dir, state)

    # BUG FIX: previously the results file was only written when advancing,
    # so the FINAL item's rating never reached annotation_results.json.
    with open(os.path.join(user_dir, "annotation_results.json"), "w") as f:
        json.dump(state['results'], f, indent=4)

    if is_last:
        return "✅ ALL TASKS COMPLETED", "Status: Finished", 1, state

    text, progress, rating = get_current_ui_values(state)
    return text, progress, rating, state
213
+
214
def go_back(state):
    """Move the cursor back one item; warn (and change nothing) at the start."""
    at_start = state is None or state['current_index'] <= 0
    if at_start:
        gr.Warning("Already at the first item.")
        return [gr.update()] * 3 + [state]

    state['current_index'] -= 1
    text, progress, rating = get_current_ui_values(state)
    return text, progress, rating, state
222
+
223
+ # --- UI INTERFACE ---
224
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Per-browser-session state holding the user's queue, cursor, and results.
    session_state = gr.State()

    gr.Markdown("# Medical Text Readability Annotation")

    # Collapsible calibration material shown above the task.
    with gr.Accordion("Instructions & Calibration", open=False):
        gr.HTML(GUIDE_HTML)
        gr.HTML(EXAMPLES_HTML)

    # Login screen (visible first).
    with gr.Column(visible=True) as intro_box:
        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., user_101")
        btn_start = gr.Button("Start / Resume Annotation", variant="primary")

    # Annotation screen (revealed after login).
    with gr.Column(visible=False) as task_box:
        progress_label = gr.Label(label="Overall Progress")
        doc_display = gr.Textbox(interactive=False, lines=12, label="Medical Text")
        doc_slider = gr.Slider(1, 5, step=1, label="Difficulty (1=Easy, 5=Hard)", value=3)

        with gr.Row():
            btn_prev = gr.Button("⬅️ Previous", variant="secondary")
            btn_submit = gr.Button("Submit & Next ➡️", variant="primary")

    # Wire buttons to the session handlers.
    btn_start.click(
        fn=start_session,
        inputs=[username_input],
        outputs=[intro_box, task_box, doc_display, progress_label, doc_slider, session_state],
    )
    btn_submit.click(
        fn=submit_rating,
        inputs=[doc_slider, session_state],
        outputs=[doc_display, progress_label, doc_slider, session_state],
    )
    btn_prev.click(
        fn=go_back,
        inputs=[session_state],
        outputs=[doc_display, progress_label, doc_slider, session_state],
    )

if __name__ == "__main__":
    demo.launch(share=True)
code/interface/annotators_v5_tran_quality.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import json
import os
from datetime import datetime

# --- PATH CONFIGURATION ---
# DATA_PATH = "/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en_0_20.json"
DATA_PATH = "/home/mshahidul/readctrl/data/data_annotator_data/syn_data_diff_labels_en_0_80.json"
SAVE_ROOT = "/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)"
os.makedirs(SAVE_ROOT, exist_ok=True)

# --- UI HTML COMPONENTS (Kept same as original) ---
GUIDE_HTML = """
<div style="background-color: #f9f9f9; padding: 15px; border-left: 6px solid #4CAF50; border-radius: 4px; margin-bottom: 20px;">
<h3>Rating Guide: Medical Text Difficulty</h3>
<table style="width:100%; border-collapse: collapse; text-align: left;">
<tr style="background-color: #e8f5e9;">
<th style="padding: 8px; border: 1px solid #ddd;">Score</th>
<th style="padding: 8px; border: 1px solid #ddd;">Description</th>
</tr>
<tr><td><b>1</b></td><td><b>Very Easy:</b> Simple words, no medical jargon.</td></tr>
<tr><td><b>2</b></td><td><b>Easy:</b> Conversational medical terms.</td></tr>
<tr><td><b>3</b></td><td><b>Moderate:</b> Standard patient education material.</td></tr>
<tr><td><b>4</b></td><td><b>Hard:</b> Significant technical jargon.</td></tr>
<tr><td><b>5</b></td><td><b>Very Hard:</b> Specialist-level / Academic.</td></tr>
</table>
</div>
"""

EXAMPLES_HTML = """
<div style="background-color: #ffffff; padding: 15px; border: 1px solid #ddd; border-radius: 4px;">
<h3 style="color: #2e7d32;">Reference Examples</h3>
<div style="display: flex; gap: 15px;">
<div style="flex: 1; background-color: #f1f8e9; padding: 10px; border-radius: 4px;">
<h4>Level 1-2</h4>
<p>"She had a kidney problem... a big blood clot blocked veins in her brain."</p>
</div>
<div style="flex: 1; background-color: #ffebee; padding: 10px; border-radius: 4px;">
<h4>Level 4-5</h4>
<p>"Idiopathic NS inaugurated by cerebral venous thrombosis extended to the right jugular vein."</p>
</div>
</div>
</div>
"""

# --- DATA LOADING ---
if os.path.exists(DATA_PATH):
    with open(DATA_PATH, "r") as f:
        FULL_DATASET = json.load(f)
    # Only items from position 60 onward are used in this annotation round.
    FULL_DATASET = FULL_DATASET[60:]
else:
    # BUG FIX: was `assert False, ...`, which is stripped under `python -O`.
    raise FileNotFoundError(f"Data file not found at {DATA_PATH}")

# --- PERSISTENCE HELPERS ---
def get_user_dir(username):
    """Map a free-form username onto a sanitized per-user directory path."""
    clean_username = "".join([c for c in username if c.isalnum() or c in (' ', '_', '-')]).strip() or "anonymous"
    return os.path.join(SAVE_ROOT, clean_username)

def save_state(user_dir, state_dict):
    """Persist the full session state as pretty-printed JSON."""
    with open(os.path.join(user_dir, "state.json"), "w") as f:
        json.dump(state_dict, f, indent=4)

def load_state(user_dir):
    """Load a previously saved session state, or None if none exists."""
    state_path = os.path.join(user_dir, "state.json")
    if os.path.exists(state_path):
        with open(state_path, "r") as f:
            return json.load(f)
    return None

# --- LOGIC FUNCTIONS ---
def get_current_ui_values(state):
    """Return (text, progress label, rating) for the item at the cursor."""
    idx = state['current_index']
    current_item = state['queue'][idx]

    # Surface a previously recorded rating when revisiting an item.
    existing_rating = 3  # Default
    for res in state['results']:
        if res['queue_position'] == idx:
            existing_rating = res['rating']
            break

    progress = f"Item {idx + 1} of {len(state['queue'])}"
    return current_item['generated_summary'], progress, existing_rating

def start_session(username):
    """Create a fresh annotation session for ``username`` or resume a saved one."""
    if not username:
        gr.Warning("Please enter a username!")
        return [gr.update()] * 5

    user_dir = get_user_dir(username)
    os.makedirs(user_dir, exist_ok=True)
    existing_state = load_state(user_dir)

    if existing_state:
        gr.Info(f"Welcome back! Resuming from item {existing_state['current_index'] + 1}.")
        state = existing_state
    else:
        state = {
            "username": username,
            "current_index": 0,
            "queue": list(FULL_DATASET),
            "results": [],
            "completed": False
        }
        save_state(user_dir, state)

    text, progress, rating = get_current_ui_values(state)
    return (gr.update(visible=False), gr.update(visible=True), text, progress, rating, state)

def submit_rating(doc_slider, state):
    """Record the rating for the current item and advance the queue.

    Returns (text, progress, slider value, state) — exactly four values,
    matching the four Gradio outputs wired to this callback.
    """
    if state is None:
        # BUG FIX: previously returned FIVE values for four outputs.
        return "", "Error", 3, None

    user_dir = get_user_dir(state['username'])
    idx = state['current_index']
    current_item = state['queue'][idx]

    new_result = {
        "queue_position": idx,
        "doc_id": current_item.get('index', 'no_id'),
        "label": current_item.get('label', 'no_label'),
        "rating": doc_slider,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Overwrite any previous rating for this position, then keep results ordered.
    state['results'] = [r for r in state['results'] if r['queue_position'] != idx]
    state['results'].append(new_result)
    state['results'].sort(key=lambda x: x['queue_position'])

    is_last = idx + 1 >= len(state['queue'])
    if is_last:
        state['completed'] = True
    else:
        state['current_index'] += 1
    save_state(user_dir, state)

    # BUG FIX: previously only written when advancing, so the final item's
    # rating never reached annotation_results.json.
    with open(os.path.join(user_dir, "annotation_results.json"), "w") as f:
        json.dump(state['results'], f, indent=4)

    if is_last:
        return "✅ ALL TASKS COMPLETED", "Status: Finished", 1, state

    text, progress, rating = get_current_ui_values(state)
    return text, progress, rating, state

def go_back(state):
    """Move the cursor back one item; warn (and change nothing) at the start."""
    if state is None or state['current_index'] <= 0:
        gr.Warning("Already at the first item.")
        return [gr.update()] * 3 + [state]

    state['current_index'] -= 1
    text, progress, rating = get_current_ui_values(state)
    return text, progress, rating, state

# --- UI INTERFACE ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    session_state = gr.State()

    gr.Markdown("# Medical Text Readability Annotation")

    with gr.Accordion("Instructions & Calibration", open=False):
        gr.HTML(GUIDE_HTML)
        gr.HTML(EXAMPLES_HTML)

    with gr.Column(visible=True) as intro_box:
        username_input = gr.Textbox(label="Enter Your Name/ID", placeholder="e.g., user_101")
        btn_start = gr.Button("Start / Resume Annotation", variant="primary")

    with gr.Column(visible=False) as task_box:
        progress_label = gr.Label(label="Overall Progress")
        doc_display = gr.Textbox(interactive=False, lines=12, label="Medical Text")
        doc_slider = gr.Slider(1, 5, step=1, label="Difficulty (1=Easy, 5=Hard)", value=3)

        with gr.Row():
            btn_prev = gr.Button("⬅️ Previous", variant="secondary")
            btn_submit = gr.Button("Submit & Next ➡️", variant="primary")

    # --- EVENT HANDLERS ---
    btn_start.click(
        fn=start_session,
        inputs=[username_input],
        outputs=[intro_box, task_box, doc_display, progress_label, doc_slider, session_state]
    )

    btn_submit.click(
        fn=submit_rating,
        inputs=[doc_slider, session_state],
        outputs=[doc_display, progress_label, doc_slider, session_state]
    )

    btn_prev.click(
        fn=go_back,
        inputs=[session_state],
        outputs=[doc_display, progress_label, doc_slider, session_state]
    )

if __name__ == "__main__":
    demo.launch(share=True)
code/interface/instr ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # gr.Markdown("# 🏥 Health Literacy Subclaim Annotation\n## Texts labeled as low health literacy include less information than those labeled as intermediate health literacy, and intermediate health literacy texts include less information than proficient health literacy texts.\nSome key information has already been pre-selected to ensure that each label contains a minimum required amount of information. If you believe additional information should be included for a given label, please select the corresponding checkboxes.")
4
+ # with gr.Accordion("📖 Read Instructions First", open=True):
5
+ # gr.Markdown("""
6
+
7
+ # ### Step 1: Read the Text Type
8
+
9
+ # You will see **one text at a time**. At the top, the interface will tell you whether this is:
10
+
11
+ # * **Full Text**, or
12
+ # * **Gold Summary**
13
+
14
+ # Please read the text carefully before selecting any subclaims.
15
+
16
+ # ---
17
+
18
+ # ### Step 2: Review the Subclaims
19
+
20
+ # Below the text, you will see a list of **subclaims**.
21
+ # Each subclaim represents one piece of information from the text.
22
+
23
+ # **Example subclaims:**
24
+
25
+ # * ☐ The patient has high blood pressure.
26
+ # * ☐ The patient is 62 years old.
27
+ # * ☐ The patient experiences chest pain when breathing.
28
+ # * ☐ A chest X-ray shows pneumonia in the right lung.
29
+ # * ☐ The COVID test result is negative.
30
+
31
+ # ---
32
+
33
+ # ### Step 3: Annotate for Each Health Literacy Label
34
+
35
+ # You must select subclaims **separately for each label**.
36
+
37
+ # #### Low Health Literacy
38
+
39
+ # Select **only the most essential information** needed for basic understanding.
40
+
41
+ # **Good selection example:**
42
+
43
+ # * ☑ The patient has high blood pressure.
44
+ # * ☑ A chest X-ray shows pneumonia in the right lung.
45
+
46
+ # **Do NOT include:**
47
+
48
+ # * Exact age
49
+ # * Test details unless critical
50
+ # * Extra clinical findings
51
+
52
+ # ➡ Coverage should be **lowest**.
53
+
54
+ # ---
55
+
56
+ # #### Intermediate Health Literacy
57
+
58
+ # Select the **core information plus some helpful details**.
59
+
60
+ # **Good selection example:**
61
+
62
+ # * ☑ The patient has high blood pressure.
63
+ # * ☑ The patient experiences chest pain when breathing.
64
+ # * ☑ A chest X-ray shows pneumonia in the right lung.
65
+ # * ☑ The COVID test result is negative.
66
+
67
+ # ➡ Coverage should be **more than low**, but **less than proficient**.
68
+
69
+ # ---
70
+
71
+ # #### Proficient Health Literacy
72
+
73
+ # Select **all clinically relevant information**.
74
+
75
+ # **Good selection example:**
76
+
77
+ # * ☑ The patient has high blood pressure.
78
+ # * ☑ The patient is 62 years old.
79
+ # * ☑ The patient experiences chest pain when breathing.
80
+ # * ☑ A chest X-ray shows pneumonia in the right lung.
81
+ # * ☑ The COVID test result is negative.
82
+
83
+ # ➡ Coverage should be **highest**.
84
+
85
+ # ---
86
+
87
+ # ### Step 4: Check Information Percentages
88
+
89
+ # The interface shows the **percentage of selected information** for each label.
90
+
91
+ # A correct annotation should follow this order:
92
+
93
+ # > **Low % < Intermediate % < Proficient %**
94
+
95
+ # ⚠️ If low health literacy has more information than intermediate or proficient, you will see a warning. Please revise your selections.
96
+
97
+ # ---
98
+
99
+ # ### Key Reminder
100
+
101
+ # * Some subclaims may already be pre-selected to ensure **minimum required information**.
102
+ # * Only add new subclaims if you believe they are appropriate for that label.
103
+ # * When finished, submit and proceed to the **next instance**.
104
+
105
+
106
+
107
+ # """)
code/interface/instructions ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 📖 Annotation Guide: Health Literacy
2
+ Welcome! Your task is to determine which pieces of information (subclaims) belong in different versions of a health text based on **Health Literacy levels**.
3
+ * **Pre-selections:** Some boxes are checked by default—these are the "minimum required" facts.
4
+ ## Sometimes, generated summaries with different labels contain all the information present in the gold summary.
5
+ ## In the case of full text, the amount of information included depends on the readability level. Texts with a low readability label contain less information than those with a proficient readability label.
6
+ ## Consistency: Any information listed under 'Low' should automatically also appear under 'Intermediate' and 'Proficient'.
7
+ ---
8
+
9
+ ### 🟢 Step 1: Identify the Source
10
+ Check the top of the interface. You are working with either:
11
+ * **Full Text:** The original clinical document.
12
+ * **Gold Summary:** A condensed version of the facts.
13
+
14
+ ### 🔍 Step 2: Review the Subclaims
15
+ Subclaims are individual facts extracted from the text.
16
+ > *Example: "The patient is 62 years old" or "The X-ray shows pneumonia."*
17
+
18
+ ---
19
+
20
+ ### ⚖️ Step 3: Annotate by Literacy Level
21
+ You must select checkboxes for **three different audiences**. The goal is to create a "ladder" of information:
22
+
23
+ | Level | Goal | Inclusion Strategy |
24
+ | :--- | :--- | :--- |
25
+ | **🟢 Low** | **Basic Survival** | Only the absolute essentials. What must they know to stay safe? |
26
+ | **🔵 Intermediate** | **Clear Context** | Core info + helpful context. Explain the "what" and "why." |
27
+ | **🟣 Proficient** | **Full Detail** | Everything. Include clinical findings, ages, and specific test data. |
28
+
29
+ ---
30
+
31
+ ### 📊 Step 4: The Golden Rule (Check Your Percentages)
32
+ To ensure high-quality data, your selections **must** follow this hierarchy:
33
+ # **Low % < Intermediate % < Proficient %**
34
+
35
+ ⚠️ **Wait for the Green Light:** If the **Low** level contains more information than **Intermediate**, the system will show a warning. Adjust your checkboxes until the percentages flow from lowest to highest.
36
+
37
+ ---
38
+
39
+ ### 💡 Quick Tips
40
+
41
+ * **Clinical Relevance:** For **Proficient**, include specific numbers (e.g., "140/90 mmHg") that might be too technical for **Low**.
42
+
43
+ **Ready to start?** Scroll down to begin your first annotation.
code/interface/interface_correction_data.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import os
4
+ from openai import OpenAI
5
+
6
+ # --- CONFIGURATION ---
7
+ DATA_PATH = '/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/code/correction_evaluation_full_text_with_gs.json'
8
+ SAVE_DIR = '/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/correction_data/'
9
+ PROMPT_TEMPLATE_PATH = "/home/mshahidul/readctrl/prompts/syn_data_gen_diff_label_mod.txt"
10
+ API_FILE_PATH = "/home/mshahidul/api_new.json"
11
+
12
+ # --- INITIALIZATION ---
13
+ # Load API Key
14
+ with open(API_FILE_PATH, "r") as f:
15
+ api_keys = json.load(f)
16
+ client = OpenAI(api_key=api_keys["openai"])
17
+
18
+ # Load Prompt Template
19
+ with open(PROMPT_TEMPLATE_PATH, "r") as f:
20
+ PROMPT_TEMPLATE = f.read()
21
+
22
def load_data():
    """Read the evaluation dataset from disk; empty list if the file is absent."""
    if not os.path.exists(DATA_PATH):
        return []
    with open(DATA_PATH, 'r') as fh:
        return json.load(fh)
27
+
28
+ DATA = load_data()
29
+
30
+ # --- AI LOGIC ---
31
def call_ai_processor(index, full_text, gold_summary):
    """Calls GPT-5 (OpenAI API) and extracts the text for the current label.

    On any failure the error is returned as a display string rather than
    raised, so the Gradio UI keeps working.
    """
    try:
        item = DATA[index]
        target_label = item.get('ai_label')  # e.g., "low_health_literacy"

        # 'source_language' should ideally be in the JSON; default to English.
        source_lang = item.get('language', 'English')

        # Fill the prompt template placeholders one by one.
        filled_prompt = PROMPT_TEMPLATE
        for placeholder, value in (
            ("<<<FULL_TEXT>>>", full_text),
            ("<<<SOURCE_LANGUAGE>>>", source_lang),
            ("<<<GOLD_SUMMARY>>>", gold_summary),
            ("<<<TARGET_LABEL>>>", target_label),
        ):
            filled_prompt = filled_prompt.replace(placeholder, value)

        response = client.chat.completions.create(
            model="gpt-5-mini",  # Change to "gpt-5" or specific model name when available
            messages=[{"role": "user", "content": filled_prompt}],
            response_format={"type": "json_object"}
        )

        content = json.loads(response.choices[0].message.content)

        # Only the text for the label currently being edited is needed;
        # target_label usually matches the keys: low_health_literacy, etc.
        return content.get(target_label, "Error: Label not found in AI response.")

    except Exception as e:
        return f"AI Error: {str(e)}"
64
+
65
+ # --- DATA HELPERS ---
66
def get_user_save_path(username):
    """Return the per-user results file path, keeping only letters and digits.

    The previous trailing ``.rstrip()`` was a no-op (no whitespace can
    survive the character filter), so it has been removed.
    """
    clean_name = "".join(c for c in username if c.isalpha() or c.isdigit())
    return os.path.join(SAVE_DIR, f"final_corrected_{clean_name}.json")
69
+
70
def load_user_results(username):
    """Load previously saved corrections for ``username`` (empty list if none)."""
    path = get_user_save_path(username)
    if not os.path.exists(path):
        return []
    with open(path, 'r') as fh:
        return json.load(fh)
76
+
77
def get_record(index):
    """Build the UI payload for DATA[index]; returns None when out of range.

    Tuple layout: (doc_id, annotator summary, pretty label, full text,
    AI-generated text, index, gold summary).
    """
    if not (0 <= index < len(DATA)):
        return None

    item = DATA[index]
    ai_label = item.get('ai_label', '')
    ai_text = item.get('diff_label_texts', {}).get(ai_label, "Text not found")
    gold_summary = item.get('summary', '')  # needed later for the AI prompt

    # Human-readable summary of the three annotators' decisions.
    anno_info = "\n".join([
        f"Plaban: {item.get('category_plaban')} (Rating: {item.get('rating_plaban')})",
        f"Mahi: {item.get('category_mahi')} (Rating: {item.get('rating_mahi')})",
        f"Shama: {item.get('category_shama')} (Rating: {item.get('rating_shama')})",
    ])

    return (
        item.get('doc_id'),
        anno_info,
        ai_label.replace("_", " ").title(),
        item.get('fulltext'),
        ai_text,
        index,
        gold_summary,
    )
100
+
101
def login_user(username):
    """Validate the username and resume at the first unannotated record.

    Resumption works by counting the user's saved results: that count is
    the index of the next record to show.
    """
    if not username or len(username.strip()) == 0:
        # No username: stay on the login screen.
        return gr.update(visible=True), gr.update(visible=False), 0, None, "", "", "", "", ""

    start_index = len(load_user_results(username))

    if start_index >= len(DATA):
        return gr.update(visible=False), gr.update(visible=True), start_index, "Finished!", "All caught up!", "No more data.", "No more data.", "", ""

    record = get_record(start_index)
    return (
        gr.update(visible=False),
        gr.update(visible=True),
        start_index,
        record[0], record[1], record[2], record[3], record[4], record[6]
    )
118
+
119
def save_and_next(username, index, corrected_text, is_ok):
    """Persist the decision for DATA[index] and return widget values for the next record."""
    user_results = load_user_results(username)
    current_item = DATA[index]
    original_ai_text = current_item.get('diff_label_texts', {}).get(current_item['ai_label'])

    # "OK" keeps the original AI text; otherwise store whatever is in the box.
    final_text = original_ai_text if is_ok else corrected_text

    user_results.append({
        "doc_id": current_item['doc_id'],
        "ai_label": current_item['ai_label'],
        "status": "Approved" if is_ok else "Manually Corrected/AI Refined",
        "final_text": final_text,
        "original_ai_text": original_ai_text,
    })

    with open(get_user_save_path(username), 'w') as f:
        json.dump(user_results, f, indent=4)

    next_index = index + 1
    if next_index >= len(DATA):
        # Order matches action_outputs; trailing "" clears the correction box.
        return [None, "Finished!", "Finished!", "No more data.", "No more data.", next_index, "No more data.", ""]
    return list(get_record(next_index)) + [""]
145
+
146
# --- GRADIO UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📝 AI Label Correction Interface (v2 with GPT-Refinement)")

    # Session state: record index, logged-in user, and the gold summary
    # that call_ai_processor needs when building its prompt.
    current_idx = gr.State(0)
    user_session = gr.State("")
    gold_summary_hidden = gr.State("")

    with gr.Row() as login_row:
        with gr.Column(scale=1):
            user_input = gr.Textbox(label="Enter Username to Resume", placeholder="e.g., Shahidul")
            btn_login = gr.Button("Start Annotation", variant="primary")

    with gr.Column(visible=False) as main_container:
        with gr.Row():
            with gr.Column(scale=1):
                doc_id_display = gr.Textbox(label="Document ID", interactive=False)
                ai_label_display = gr.Label(label="Target AI Label")
                annotator_stats = gr.Textbox(label="Human Annotator Ratings", lines=4, interactive=False)
            with gr.Column(scale=2):
                full_text_display = gr.Textbox(label="Source Full Text", lines=10, interactive=False)

        with gr.Row():
            with gr.Column():
                ai_generated_text = gr.Textbox(label="Original AI Text", lines=6, interactive=False)
            with gr.Column():
                manual_correction = gr.Textbox(label="AI Refinement / Manual Correction", placeholder="AI generated text will appear here...", lines=6)
                btn_ai_check = gr.Button("✨ Check & Refine through AI", variant="secondary")

        with gr.Row():
            btn_ok = gr.Button("✅ Original Text is OK", variant="primary")
            btn_fix = gr.Button("💾 Save Current Correction/AI Text", variant="stop")

    # --- EVENT WIRING ---
    # Login populates the panel, then a follow-up step records the username.
    btn_login.click(
        fn=login_user,
        inputs=[user_input],
        outputs=[login_row, main_container, current_idx, doc_id_display, annotator_stats, ai_label_display, full_text_display, ai_generated_text, gold_summary_hidden],
    ).then(fn=lambda username: username, inputs=[user_input], outputs=[user_session])

    # Ask the LLM to refine the currently displayed text.
    btn_ai_check.click(
        fn=call_ai_processor,
        inputs=[current_idx, full_text_display, gold_summary_hidden],
        outputs=[manual_correction],
    )

    action_inputs = [user_session, current_idx, manual_correction]
    action_outputs = [doc_id_display, annotator_stats, ai_label_display, full_text_display, ai_generated_text, current_idx, gold_summary_hidden, manual_correction]

    # Approve the original AI text as-is.
    btn_ok.click(
        fn=lambda user, idx, txt: save_and_next(user, idx, txt, True),
        inputs=action_inputs,
        outputs=action_outputs,
    )

    # Persist the corrected/refined text instead of the original.
    btn_fix.click(
        fn=lambda user, idx, txt: save_and_next(user, idx, txt, False),
        inputs=action_inputs,
        outputs=action_outputs,
    )

if __name__ == "__main__":
    demo.launch(share=True)
code/interface/t.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
from gradio_client import Client

# Minimal smoke test: hit the /chat_predict endpoint on a temporary share link.
remote = Client("https://23833b5a465382100f.gradio.live/")
reply = remote.predict(
    message="Hello!!",
    api_name="/chat_predict",
)
print(reply)
code/interface/translate_gemma.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from openai import OpenAI
3
+ import base64
4
+ import io
5
+
6
# OpenAI-compatible client pointed at the local vLLM TranslateGemma server.
# vLLM ignores the key's value, but the client library requires one.
client = OpenAI(
    base_url="http://172.16.34.29:8006/v1",
    api_key="vllm-token",
)
11
+
12
def encode_image_to_base64(image):
    """Serialize a PIL image to a raw base64 JPEG string (no data-URI prefix).

    Returns None when no image is supplied.
    """
    if image is None:
        return None
    jpeg_buffer = io.BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    raw_bytes = jpeg_buffer.getvalue()
    return base64.b64encode(raw_bytes).decode("utf-8")
19
+
20
def run_translation(source_code, target_code, text_input, image_input):
    """Translate text or an image via the vLLM TranslateGemma endpoint.

    Builds the single-item content payload the server's schema expects and
    returns the translated string, or a human-readable error message.

    Args:
        source_code: Source language code (e.g. "en").
        target_code: Target language code (e.g. "bn").
        text_input: Text to translate; may be None/empty when using an image.
        image_input: Optional PIL image; takes priority over text.
    """
    # The schema requires all of these keys to be present in the mapping.
    payload = {
        "source_lang_code": source_code,
        "target_lang_code": target_code,
        "text": None,
        "image": None,
    }

    if image_input is not None:
        payload["type"] = "image"
        payload["image"] = encode_image_to_base64(image_input)
    else:
        # Fix: Gradio can deliver None for a cleared textbox; the original
        # `text_input.strip()` raised AttributeError in that case.
        if not text_input or not text_input.strip():
            return "Please provide text or an image."
        payload["type"] = "text"
        payload["text"] = text_input

    try:
        # Crucial: vLLM expects the payload as the single element of the
        # content list — exactly [ { ... } ].
        response = client.chat.completions.create(
            model="translate_gemma",
            messages=[{
                "role": "user",
                "content": [payload]
            }],
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"⚠️ Error: {str(e)}"
52
+
53
# --- Gradio UI Layout ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🌍 TranslateGemma 27B")
    gr.Markdown("Corrected schema for vLLM inference.")

    # Language pair selection.
    with gr.Row():
        src_code = gr.Textbox(label="Source Language Code", value="en")
        tgt_code = gr.Textbox(label="Target Language Code", value="bn")

    with gr.Row():
        with gr.Column():
            text_box = gr.Textbox(label="Text Input", placeholder="Type English here...", lines=5)
            image_box = gr.Image(label="Image Input", type="pil")
            submit_btn = gr.Button("Translate", variant="primary")
        with gr.Column():
            output_box = gr.Textbox(label="Bangla Translation", interactive=False, lines=10)

    # Image input takes priority over text inside run_translation.
    submit_btn.click(
        fn=run_translation,
        inputs=[src_code, tgt_code, text_box, image_box],
        outputs=output_box,
    )

if __name__ == "__main__":
    demo.launch(share=True)
code/interface/translation_quality.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import os
4
+ from datetime import datetime
5
+
6
+
7
def sanitize_username(username: str) -> str:
    """Reduce *username* to filesystem-safe characters (alnum, '_', '-')."""
    if not username:
        return ""
    stripped = username.strip()
    return "".join(ch for ch in stripped if ch.isalnum() or ch in "_-")
14
+
15
def get_user_session_file(username):
    """Path of the per-user ratings JSON inside SAVE_DIR."""
    return os.path.join(SAVE_DIR, f"ratings_{sanitize_username(username)}.json")
18
+
19
# Language selection; the code segregates saved ratings on disk.
# NOTE(review): ISO 639-1 for Bengali is "bn"; "be" is kept because existing
# rating directories already use it.
language = "Bengali"
lang_code_map = {"Chinese": "ch", "Hindi": "hi", "Bengali": "be"}
assert language in lang_code_map, "Unsupported language"
language_code = lang_code_map[language]


# Parallel corpora: translations and their English sources, first 50 items each.
TRANSLATION_PATH = "/home/mshahidul/readctrl/data/translated_data/translation_english2bangla_v1.json"
with open(TRANSLATION_PATH, "r", encoding="utf-8") as f:
    translation_dataset = json.load(f)[:50]

SRC_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json"
with open(SRC_PATH, "r", encoding="utf-8") as f:
    src_dataset = json.load(f)[:50]

# Pair source and translation by position (the files are assumed aligned).
dataset = []
for i in range(min(len(src_dataset), len(translation_dataset))):
    dataset.append({
        "src_fulltext": src_dataset[i]["fulltext"],
        "translated_fulltext": translation_dataset[i]["fulltext_translated"]["translated_medical_note"],
        "id": translation_dataset[i]["id"],
    })

# 2. Configuration for saving: per-user rating files live here.
SAVE_DIR = f"/home/mshahidul/readctrl/data/translated_data/rating_info/{language_code}"
os.makedirs(SAVE_DIR, exist_ok=True)

SESSION_FILE = None  # resolved per user via get_user_session_file()

RATING_OPTIONS = [
    ("1 - Poor (Incorrect/Nonsense)", 1),
    ("2 - Fair (Understandable but awkward)", 2),
    ("3 - Good (Accurate/Perfect)", 3),
]

custom_css = """
.small-header { font-size: 0.85rem !important; font-weight: 600; margin-bottom: -10px; color: #555; }
.nav-row { background-color: #f9f9f9; padding: 10px; border-radius: 8px; margin-bottom: 15px; }
"""
66
+
67
def save_rating_to_json(data_item, username):
    """Insert or update *data_item* (keyed by "index") in the user's ratings file."""
    session_file = get_user_session_file(username)

    raw = []
    if os.path.exists(session_file):
        with open(session_file, "r", encoding="utf-8") as f:
            try:
                raw = json.load(f)
            except json.JSONDecodeError:
                raw = []

    # Backward/forward compatibility: a bare list of records, or a dict
    # wrapper carrying a "records" key.
    if isinstance(raw, dict):
        records = raw.get("records", [])
    elif isinstance(raw, list):
        records = raw
    else:
        records = []

    # One record per sample index: overwrite an existing entry if present.
    target = data_item.get("index")
    for pos, rec in enumerate(records):
        if isinstance(rec, dict) and rec.get("index") == target:
            records[pos] = data_item
            break
    else:
        records.append(data_item)

    payload = {
        "username": sanitize_username(username) or username,
        "updated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "records": records,
    }
    with open(session_file, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)
101
+
102
+
103
def load_user_records(username):
    """Read the user's saved rating records; tolerate missing or corrupt files."""
    session_file = get_user_session_file(username)
    if not os.path.exists(session_file):
        return []
    try:
        with open(session_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        return []
    # Accept both the dict wrapper and the legacy bare-list layout.
    records = data.get("records", []) if isinstance(data, dict) else data
    return records if isinstance(records, list) else []
117
+
118
def load_example(index):
    """Return widget values for sample *index* (clamped into range)."""
    total = len(dataset)
    index = min(max(index, 0), total - 1)
    item = dataset[index]
    pct_done = (index / total) * 100
    return (
        item["src_fulltext"],                                 # src_display
        item["translated_fulltext"],                          # eng_display
        None,                                                 # rating_dropdown cleared
        index,                                                # current_index
        f"Sample {index + 1} of {total} ({pct_done:.1f}%)",   # progress_display
        pct_done,                                             # progress_bar
        index + 1,                                            # jump_input (1-based)
    )
135
+
136
def get_last_index_for_user(username):
    """First unannotated sample index for *username*; len(dataset) when done."""
    if not username:
        return 0
    done_indices = {
        rec["index"]
        for rec in load_user_records(username)
        if isinstance(rec, dict) and isinstance(rec.get("index"), int)
    }
    # Resume means: first unannotated sample in order.
    return next((i for i in range(len(dataset)) if i not in done_indices), len(dataset))
151
+
152
+
153
def load_example_or_done(index):
    """Like load_example(), but show a completion screen past the last sample."""
    total = len(dataset)
    if index < total:
        return load_example(index)
    return (
        "✅ ALL DONE",
        "✅ ALL DONE",
        None,
        total,
        f"✅ Completed all {total} samples",
        100,
        total,
    )
167
+
168
def next_item(index, rating, src_txt, eng_txt, username):
    """Validate inputs, save the current rating, then jump to the next unannotated sample."""
    if rating is None:
        raise gr.Error("Please select a rating before proceeding!")
    if not username:
        raise gr.Error("Please enter your username!")
    safe_user = sanitize_username(username)
    if not safe_user:
        raise gr.Error("Username must contain letters/numbers (optionally _ or -).")

    save_rating_to_json(
        {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "index": index,
            "src_text": src_txt,
            "translated_text": eng_txt,
            "rating": rating,
            "username": safe_user,
        },
        safe_user,
    )
    gr.Info(f"Saved record {index + 1} for {safe_user}.")

    # After saving, resume at the first unannotated index (not blindly index+1).
    return load_example_or_done(get_last_index_for_user(safe_user))
190
+
191
def jump_to_instance(target_index):
    """Navigate to a 1-based sample number entered in the jump box."""
    return load_example_or_done(target_index - 1)
193
+
194
with gr.Blocks(css=custom_css) as demo:
    # Login / resume controls.
    username_box = gr.Textbox(label="Enter your username", value="", interactive=True)
    login_btn = gr.Button("Start/Resume Session", variant="primary")
    current_index = gr.State(0)
    total_count = len(dataset)

    gr.Markdown("### Translation Quality Annotation")

    # Progress and jump navigation.
    with gr.Row(elem_classes="nav-row"):
        with gr.Column(scale=2):
            progress_bar = gr.Slider(label="Progress", minimum=0, maximum=100, value=0, interactive=False)
            progress_display = gr.Markdown(f"Sample 1 of {total_count} (0.0%)")
        with gr.Column(scale=1):
            jump_input = gr.Number(label="Jump to Sample #", value=1, precision=0)
            jump_btn = gr.Button("Go", size="sm")

    # Side-by-side source and translation panes.
    with gr.Row():
        with gr.Column():
            gr.Markdown("##### Source Fulltext (English)")
            src_display = gr.Textbox(value=dataset[0]["src_fulltext"], interactive=False, lines=12, show_label=False)
        with gr.Column():
            gr.Markdown("##### Fulltext Translation (Bangla)")
            eng_display = gr.Textbox(value=dataset[0]["translated_fulltext"], interactive=False, lines=12, show_label=False)

    rating_dropdown = gr.Dropdown(choices=RATING_OPTIONS, label="Select Rating")
    with gr.Row():
        prev_btn = gr.Button("⬅ Previous (Review)", variant="secondary")
        submit_btn = gr.Button("Save & Next ➡", variant="primary")

    def login_user(username):
        """Resume the session at this user's first unannotated sample."""
        safe_user = sanitize_username(username)
        if not safe_user:
            raise gr.Error("Please enter a valid username (letters/numbers, _ or -).")
        return load_example_or_done(get_last_index_for_user(safe_user))

    # Every navigation action refreshes the same set of widgets.
    nav_outputs = [src_display, eng_display, rating_dropdown, current_index, progress_display, progress_bar, jump_input]

    login_btn.click(fn=login_user, inputs=[username_box], outputs=nav_outputs)

    submit_btn.click(
        fn=next_item,
        inputs=[current_index, rating_dropdown, src_display, eng_display, username_box],
        outputs=nav_outputs,
    )

    prev_btn.click(
        fn=lambda idx: load_example_or_done(idx - 1),
        inputs=[current_index],
        outputs=nav_outputs,
    )

    jump_btn.click(
        fn=jump_to_instance,
        inputs=[jump_input],
        outputs=nav_outputs,
    )

if __name__ == "__main__":
    demo.launch(share=True)
code/interface/translation_quality_v2.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import os
4
+ from datetime import datetime
5
+
6
+
7
def sanitize_username(username: str) -> str:
    """Reduce *username* to filesystem-safe characters (alnum, '_', '-')."""
    if not username:
        return ""
    stripped = username.strip()
    return "".join(ch for ch in stripped if ch.isalnum() or ch in "_-")

def get_user_session_file(username):
    """Path of the per-user ratings JSON inside SAVE_DIR."""
    return os.path.join(SAVE_DIR, f"ratings_{sanitize_username(username)}.json")

# Language selection; the code segregates saved ratings on disk.
# NOTE(review): ISO 639-1 for Bengali is "bn"; "be" is kept because existing
# rating directories already use it.
language = "Bengali"
lang_code_map = {"Chinese": "ch", "Hindi": "hi", "Bengali": "be"}
assert language in lang_code_map, "Unsupported language"
language_code = lang_code_map[language]


# TranslateGemma EN->BN output; each item carries both source and translation.
TRANSLATION_PATH = (
    "/home/mshahidul/readctrl/data/translated_data/translation_wo_judge/"
    "multiclinsum_gs_train_en2bn_gemma(0_200).json"
)
with open(TRANSLATION_PATH, "r", encoding="utf-8") as f:
    translation_dataset = json.load(f)

# Keep only the fields the UI needs, capped at the first 50 samples.
dataset = [
    {
        "src_fulltext": entry.get("fulltext", ""),
        "translated_fulltext": entry.get("translated_fulltext", ""),
        "id": entry.get("id"),
    }
    for entry in translation_dataset[:50]
]

# 2. Configuration for saving: per-user rating files live here.
SAVE_DIR = f"/home/mshahidul/readctrl/data/translated_data/rating_info_v2/{language_code}"
os.makedirs(SAVE_DIR, exist_ok=True)

SESSION_FILE = None  # resolved per user via get_user_session_file()

RATING_OPTIONS = [
    ("1 - Poor (Incorrect/Nonsense)", 1),
    ("2 - Fair (Understandable but awkward)", 2),
    ("3 - Good (Accurate/Perfect)", 3),
]

custom_css = """
.small-header { font-size: 0.85rem !important; font-weight: 600; margin-bottom: -10px; color: #555; }
.nav-row { background-color: #f9f9f9; padding: 10px; border-radius: 8px; margin-bottom: 15px; }
"""
63
+
64
def save_rating_to_json(data_item, username):
    """Insert or update *data_item* (keyed by "index") in the user's ratings file."""
    session_file = get_user_session_file(username)

    raw = []
    if os.path.exists(session_file):
        with open(session_file, "r", encoding="utf-8") as f:
            try:
                raw = json.load(f)
            except json.JSONDecodeError:
                raw = []

    # Backward/forward compatibility: a bare list of records, or a dict
    # wrapper carrying a "records" key.
    if isinstance(raw, dict):
        records = raw.get("records", [])
    elif isinstance(raw, list):
        records = raw
    else:
        records = []

    # One record per sample index: overwrite an existing entry if present.
    target = data_item.get("index")
    for pos, rec in enumerate(records):
        if isinstance(rec, dict) and rec.get("index") == target:
            records[pos] = data_item
            break
    else:
        records.append(data_item)

    payload = {
        "username": sanitize_username(username) or username,
        "updated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "records": records,
    }
    with open(session_file, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)


def load_user_records(username):
    """Read the user's saved rating records; tolerate missing or corrupt files."""
    session_file = get_user_session_file(username)
    if not os.path.exists(session_file):
        return []
    try:
        with open(session_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        return []
    records = data.get("records", []) if isinstance(data, dict) else data
    return records if isinstance(records, list) else []

def load_example(index):
    """Return widget values for sample *index* (clamped into range)."""
    total = len(dataset)
    index = min(max(index, 0), total - 1)
    item = dataset[index]
    pct_done = (index / total) * 100
    return (
        item["src_fulltext"],                                 # src_display
        item["translated_fulltext"],                          # eng_display
        None,                                                 # rating_dropdown cleared
        index,                                                # current_index
        f"Sample {index + 1} of {total} ({pct_done:.1f}%)",   # progress_display
        pct_done,                                             # progress_bar
        index + 1,                                            # jump_input (1-based)
    )

def get_last_index_for_user(username):
    """First unannotated sample index for *username*; len(dataset) when done."""
    if not username:
        return 0
    done_indices = {
        rec["index"]
        for rec in load_user_records(username)
        if isinstance(rec, dict) and isinstance(rec.get("index"), int)
    }
    # Resume means: first unannotated sample in order.
    return next((i for i in range(len(dataset)) if i not in done_indices), len(dataset))


def load_example_or_done(index):
    """Like load_example(), but show a completion screen past the last sample."""
    total = len(dataset)
    if index < total:
        return load_example(index)
    return (
        "✅ ALL DONE",
        "✅ ALL DONE",
        None,
        total,
        f"✅ Completed all {total} samples",
        100,
        total,
    )

def next_item(index, rating, src_txt, eng_txt, username):
    """Validate inputs, save the current rating, then jump to the next unannotated sample."""
    if rating is None:
        raise gr.Error("Please select a rating before proceeding!")
    if not username:
        raise gr.Error("Please enter your username!")
    safe_user = sanitize_username(username)
    if not safe_user:
        raise gr.Error("Username must contain letters/numbers (optionally _ or -).")

    save_rating_to_json(
        {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "index": index,
            "src_text": src_txt,
            "translated_text": eng_txt,
            "rating": rating,
            "username": safe_user,
        },
        safe_user,
    )
    gr.Info(f"Saved record {index + 1} for {safe_user}.")

    # After saving, resume at the first unannotated index (not blindly index+1).
    return load_example_or_done(get_last_index_for_user(safe_user))

def jump_to_instance(target_index):
    """Navigate to a 1-based sample number entered in the jump box."""
    return load_example_or_done(target_index - 1)
190
+
191
with gr.Blocks(css=custom_css) as demo:
    # Login / resume controls.
    username_box = gr.Textbox(label="Enter your username", value="", interactive=True)
    login_btn = gr.Button("Start/Resume Session", variant="primary")
    current_index = gr.State(0)
    total_count = len(dataset)

    gr.Markdown("## Translation Quality Annotation")
    gr.Markdown("Data generated by TranslateGemma.")

    # Progress and jump navigation.
    with gr.Row(elem_classes="nav-row"):
        with gr.Column(scale=2):
            progress_bar = gr.Slider(label="Progress", minimum=0, maximum=100, value=0, interactive=False)
            progress_display = gr.Markdown(f"Sample 1 of {total_count} (0.0%)")
        with gr.Column(scale=1):
            jump_input = gr.Number(label="Jump to Sample #", value=1, precision=0)
            jump_btn = gr.Button("Go", size="sm")

    # Side-by-side source and translation panes.
    with gr.Row():
        with gr.Column():
            gr.Markdown("##### Source Fulltext (English)")
            src_display = gr.Textbox(value=dataset[0]["src_fulltext"], interactive=False, lines=12, show_label=False)
        with gr.Column():
            gr.Markdown("##### Fulltext Translation (Bangla)")
            eng_display = gr.Textbox(value=dataset[0]["translated_fulltext"], interactive=False, lines=12, show_label=False)

    rating_dropdown = gr.Dropdown(choices=RATING_OPTIONS, label="Select Rating")
    with gr.Row():
        prev_btn = gr.Button("⬅ Previous (Review)", variant="secondary")
        submit_btn = gr.Button("Save & Next ➡", variant="primary")

    def login_user(username):
        """Resume the session at this user's first unannotated sample."""
        safe_user = sanitize_username(username)
        if not safe_user:
            raise gr.Error("Please enter a valid username (letters/numbers, _ or -).")
        return load_example_or_done(get_last_index_for_user(safe_user))

    # Every navigation action refreshes the same set of widgets.
    nav_outputs = [src_display, eng_display, rating_dropdown, current_index, progress_display, progress_bar, jump_input]

    login_btn.click(fn=login_user, inputs=[username_box], outputs=nav_outputs)

    submit_btn.click(
        fn=next_item,
        inputs=[current_index, rating_dropdown, src_display, eng_display, username_box],
        outputs=nav_outputs,
    )

    prev_btn.click(
        fn=lambda idx: load_example_or_done(idx - 1),
        inputs=[current_index],
        outputs=nav_outputs,
    )

    jump_btn.click(
        fn=jump_to_instance,
        inputs=[jump_input],
        outputs=nav_outputs,
    )

if __name__ == "__main__":
    demo.launch(share=True)
code/interface/vllm_app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from openai import OpenAI
3
+
4
# OpenAI-compatible client pointed at the local vLLM server.
# vLLM ignores the key's value, but the client library requires one.
client = OpenAI(
    base_url="http://localhost:8004/v1",
    api_key="token-not-needed",
)
9
+
10
def predict(message, history):
    """Stream a chat completion for *message*, replaying *history* as context.

    Yields the progressively accumulated assistant reply so Gradio can
    render tokens as they arrive.
    """
    # Rebuild a clean OpenAI-format conversation from Gradio's (user, bot) pairs.
    messages = []
    for pair in history:
        if len(pair) >= 2:
            messages.append({"role": "user", "content": str(pair[0])})
            messages.append({"role": "assistant", "content": str(pair[1])})

    # Append the current user turn.
    messages.append({"role": "user", "content": message})

    stream = client.chat.completions.create(
        model="Qwen/Qwen3-30B-A3B-Instruct-2507",
        messages=messages,
        temperature=0.7,
        stream=True,
    )

    partial = ""
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta is not None:
            partial += delta
        yield partial
36
+
37
# Chat UI; the 'type' argument is intentionally omitted for older Gradio versions.
demo = gr.ChatInterface(
    fn=predict,
    title="Qwen3 vLLM Chat",
    description="Interface for Qwen/Qwen3-30B-A3B-Instruct-2507 running on vLLM",
    examples=["What is the capital of France?", "Write a Python function for quicksort."],
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
code/interface/vllm_app_v2.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+ """Example for starting a Gradio OpenAI Chatbot Webserver
4
+ Start vLLM API server:
5
+ vllm serve meta-llama/Llama-2-7b-chat-hf
6
+
7
+ Start Gradio OpenAI Chatbot Webserver:
8
+ python /home/mshahidul/readctrl/code/interface/vllm_app_v2.py \
9
+ -m Qwen/Qwen3-30B-A3B-Instruct-2507 --model-url http://172.16.34.29:8004/v1
10
+
11
+ Note that `pip install --upgrade gradio` is needed to run this example.
12
+ More details: https://github.com/gradio-app/gradio
13
+
14
+ If your antivirus software blocks the download of frpc for gradio,
15
+ you can install it manually by following these steps:
16
+
17
+ 1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64
18
+ 2. Rename the downloaded file to: frpc_linux_amd64_v0.3
19
+ 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
20
+ """
21
+
22
+ import argparse
23
+
24
+ import gradio as gr
25
+ from openai import OpenAI
26
+
27
+
28
def predict(message, history, client, model_name, temp, stop_token_ids):
    """Run one chat turn against a vLLM server and return the full reply.

    The request streams, but the chunks are concatenated and returned as a
    single string.

    Args:
        message: Latest user message.
        history: Prior conversation in OpenAI messages format.
        client: OpenAI-compatible client pointed at the vLLM server.
        model_name: Model identifier to request.
        temp: Sampling temperature.
        stop_token_ids: Comma-separated token IDs, e.g. "1, 2"; blank items
            (doubled or trailing commas) are ignored.
    """
    messages = [
        {"role": "system", "content": "You are a great AI assistant."},
        *history,
        {"role": "user", "content": message},
    ]

    # Fix: the original `int(id.strip())` raised ValueError on empty items
    # (e.g. a trailing comma) and shadowed the builtin `id`.
    stop_ids = []
    if stop_token_ids:
        stop_ids = [int(tok) for tok in (t.strip() for t in stop_token_ids.split(",")) if tok]

    # Send request to the OpenAI-compatible vLLM server.
    stream = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=temp,
        stream=True,
        extra_body={
            "repetition_penalty": 1,
            "stop_token_ids": stop_ids,
        },
    )

    # Collect all chunks and concatenate them into the full message.
    full_message = ""
    for chunk in stream:
        full_message += chunk.choices[0].delta.content or ""

    return full_message
56
+
57
+
58
def parse_args():
    """Define and parse the CLI options for the chatbot webserver."""
    parser = argparse.ArgumentParser(
        description="Chatbot Interface with Customizable Parameters"
    )
    parser.add_argument("--model-url", type=str,
                        default="http://localhost:8000/v1", help="Model URL")
    parser.add_argument("-m", "--model", type=str, required=True,
                        help="Model name for the chatbot")
    parser.add_argument("--temp", type=float, default=0.8,
                        help="Temperature for text generation")
    parser.add_argument("--stop-token-ids", type=str, default="",
                        help="Comma-separated stop token IDs")
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8001)
    return parser.parse_args()
77
+
78
+
79
def build_gradio_interface(client, model_name, temp, stop_token_ids):
    """Wrap predict() with fixed connection/sampling settings in a ChatInterface."""

    def chat_predict(message, history):
        # Close over the configured client and generation parameters.
        return predict(message, history, client, model_name, temp, stop_token_ids)

    return gr.ChatInterface(
        fn=chat_predict,
        title="Chatbot Interface",
        description="A simple chatbot powered by vLLM",
        fill_height=True,
    )
89
+
90
+
91
def main():
    """Entry point: parse CLI args, build the client and UI, then serve."""
    args = parse_args()

    # vLLM ignores the API key, but the OpenAI client requires one.
    client = OpenAI(api_key="EMPTY", base_url=args.model_url)

    ui = build_gradio_interface(client, args.model, args.temp, args.stop_token_ids)

    ui.queue().launch(
        server_name=args.host, server_port=args.port, share=True
    )
110
+
111
+
112
# Run the server only when executed as a script.
if __name__ == "__main__":
    main()

# Example:
# python /home/mshahidul/readctrl/code/interface/vllm_app_v2.py --model Qwen/Qwen3-30B-A3B-Instruct-2507 --model-url http://localhost:8004/v1
code/key_subclaims_extract.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from openai import OpenAI
import json
import os
import tqdm

# --- 1. Load Paths and Data ---
data_path = '/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json'
prompt_path = "/home/mshahidul/readctrl/prompts/minimum_info_extract _v2"
api_file = "/home/mshahidul/api_new.json"
save_path = "/home/mshahidul/readctrl/data/key_subclaims_testing/key_subclaims.json"

# Dataset of source texts / summaries with pre-extracted subclaims.
with open(data_path, 'r') as fh:
    dataset = json.load(fh)

# Prompt template containing <<...>> placeholders filled in per item.
with open(prompt_path, "r") as fh:
    prompt_template = fh.read()

# API credentials are kept outside the repository.
with open(api_file, "r") as fh:
    api_keys = json.load(fh)
openai_api_key = api_keys["openai"]

client = OpenAI(api_key=openai_api_key)
26
+
27
# --- 2. Helper Functions ---
def openai_return(prompt, model="gpt-5"):
    """Send a prompt to the chat API and parse the strictly-JSON reply.

    Returns the decoded JSON object on success. On any failure (API error or
    unparsable content) returns a dict with an "error" message and whatever
    raw content was received (None if the call itself failed).
    """
    # Initialize up front so the except block can safely report partial
    # progress instead of probing locals() for an unbound name.
    content = None
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that outputs strictly in JSON format."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"}
        )
        content = response.choices[0].message.content.strip()
        return json.loads(content)
    except Exception as e:
        print(f"⚠️ Error processing API response: {e}")
        return {"error": str(e), "raw_content": content}
44
+
45
def format_subclaims(subclaim_list, prefix):
    """Render subclaims as "<prefix>-<n>: <text>" lines so the LLM can cite IDs."""
    if not isinstance(subclaim_list, list):
        # Defensive: pass malformed payloads through as plain text.
        return str(subclaim_list)
    lines = []
    for idx, claim in enumerate(subclaim_list, start=1):
        lines.append(f"{prefix}-{idx}: {claim}")
    return "\n".join(lines)
50
+
51
# --- 3. Main Processing Loop ---
res = []
if os.path.exists(save_path):
    with open(save_path, "r") as fh:
        res = json.load(fh)

# Resume after any previously saved results.
start_index = len(res)
num_to_process = 100
end_index = min(start_index + num_to_process, len(dataset))

for i in tqdm.tqdm(range(start_index, end_index)):
    item = dataset[i]

    # Raw fields from the dataset record.
    source_text = item.get('fulltext', '')
    source_subclaims_list = item.get('fulltext_subclaims', [])
    gold_summary = item.get('summary', '')
    gold_subclaims_list = item.get('summary_subclaims', [])

    # ID-tagged renderings (ST-1..., GS-1...) so the model can reference
    # individual claims in its Output Format.
    source_subclaims_formatted = format_subclaims(source_subclaims_list, "ST")
    gold_subclaims_formatted = format_subclaims(gold_subclaims_list, "GS")

    # Fill the template placeholders.
    prompt = (
        prompt_template
        .replace("<<SOURCE_TEXT>>", source_text)
        .replace("<<SOURCE_TEXT_SUBCLAIMS>>", source_subclaims_formatted)
        .replace("<<GOLD_SUMMARY>>", gold_summary)
        .replace("<<GOLD_SUMMARY_SUBCLAIMS>>", gold_subclaims_formatted)
    )

    api_response = openai_return(prompt)

    # Keep the inputs alongside the model output for later inspection.
    res.append({
        "index": i,
        "original_id": item.get('id'),
        "input_data": {
            "source_text": source_text,
            "source_subclaims": source_subclaims_list,
            "gold_summary": gold_summary,
            "gold_subclaims": gold_subclaims_list,
        },
        "llm_output": api_response,
    })

    # Autosave every 5 samples so a crash loses little work.
    if len(res) % 5 == 0:
        with open(save_path, "w") as fh:
            json.dump(res, fh, indent=2, ensure_ascii=False)

# Final save.
with open(save_path, "w") as fh:
    json.dump(res, fh, indent=2, ensure_ascii=False)

print(f"\n✅ Finished! Processed {len(res) - start_index} new samples.")
print(f"Total samples in {save_path}: {len(res)}")
code/literacy_thresholds.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import json
4
+ import math
5
+ from statistics import median, quantiles
6
+
7
+
8
+ LABEL_ORDER = ["low", "intermediate", "proficient"]
9
+ ORDERED_METRICS = {"source_coverage", "completeness"}
10
+
11
+
12
def normalize_label(key: str) -> str:
    """Map a raw literacy-level key onto one of LABEL_ORDER by substring match.

    Unrecognized keys are returned lowercased so callers can filter them out.
    """
    lowered = key.lower()
    for label in LABEL_ORDER:
        if label in lowered:
            return label
    return lowered
18
+
19
+
20
def five_number_summary(values):
    """Return min/q1/median/q3/max for `values`, or None if empty.

    statistics.quantiles() requires at least two data points, so a
    single-value sample (which the `if not values` guard alone lets
    through) is summarized as that value repeated.
    """
    if not values:
        return None
    if len(values) == 1:
        v = values[0]
        return {"min": v, "q1": v, "median": v, "q3": v, "max": v}
    q1, _, q3 = quantiles(values, n=4, method="inclusive")
    return {
        "min": min(values),
        "q1": q1,
        "median": median(values),
        "q3": q3,
        "max": max(values),
    }
31
+
32
+
33
def remove_outliers_iqr(values):
    """Drop points outside [q1 - 1.5*IQR, q3 + 1.5*IQR].

    Samples with fewer than 4 points, or a zero IQR, are returned untouched.
    Returns (filtered_values, number_removed).
    """
    if len(values) < 4:
        return values, 0
    q1, _, q3 = quantiles(values, n=4, method="inclusive")
    spread = q3 - q1
    if math.isclose(spread, 0.0):
        return values, 0
    lo = q1 - 1.5 * spread
    hi = q3 + 1.5 * spread
    kept = [v for v in values if lo <= v <= hi]
    return kept, len(values) - len(kept)
44
+
45
+
46
def parse_scores(data, metrics):
    """Collect per-metric score lists grouped by normalized literacy label.

    Items missing a literacy level, payload, or metric value are skipped.
    """
    grouped = {label: {m: [] for m in metrics} for label in LABEL_ORDER}
    for item in data:
        levels = item.get("literacy_levels") or {}
        for key, payload in levels.items():
            bucket = grouped.get(normalize_label(key))
            if bucket is None:
                continue
            scores = (payload or {}).get("scores") or {}
            for metric in metrics:
                value = scores.get(metric)
                if value is not None:
                    bucket[metric].append(value)
    return grouped
59
+
60
+
61
def suggest_thresholds(per_label_summaries, label_order):
    """Propose a boundary score between each adjacent pair of labels.

    If the two distributions don't overlap (lower q3 < upper q1), the
    boundary is the midpoint of that gap; otherwise it is the midpoint of
    the two medians. A missing summary on either side yields None.
    """
    thresholds = {}
    for metric, by_label in per_label_summaries.items():
        metric_bounds = {}
        for lower_label, upper_label in zip(label_order, label_order[1:]):
            key = f"{lower_label}_to_{upper_label}"
            lower = by_label.get(lower_label)
            upper = by_label.get(upper_label)
            if not lower or not upper:
                metric_bounds[key] = None
            elif lower["q3"] < upper["q1"]:
                metric_bounds[key] = (lower["q3"] + upper["q1"]) / 2
            else:
                metric_bounds[key] = (lower["median"] + upper["median"]) / 2
        thresholds[metric] = metric_bounds
    return thresholds
79
+
80
+
81
def print_summary(metrics, cleaned_by_label, outlier_counts, summaries):
    """Pretty-print counts, outlier removals and five-number summaries per label."""
    for label in LABEL_ORDER:
        print(f"\nLabel: {label}")
        for metric in metrics:
            values = cleaned_by_label[label][metric]
            stats = summaries[metric].get(label)
            print(f"  Metric: {metric}")
            print(f"    Count (after outliers): {len(values)}")
            print(f"    Outliers removed: {outlier_counts[label][metric]}")
            if stats:
                rendered = ", ".join(
                    f"{key}={stats[key]:.4f}"
                    for key in ("min", "q1", "median", "q3", "max")
                )
                print(f"    Five-number summary: {rendered}")
            else:
                print("    Five-number summary: n/a")
102
+
103
+
104
def medians_in_order(summaries, metric, label_order):
    """Return True iff the metric's medians are non-decreasing across labels.

    Returns False if any label lacks a summary. Works for any number of
    labels; the original hard-coded exactly three
    (``medians[0] <= medians[1] <= medians[2]``), which raised IndexError
    for shorter orders.
    """
    medians = []
    for label in label_order:
        summary = summaries.get(metric, {}).get(label)
        if not summary:
            return False
        medians.append(summary["median"])
    return all(a <= b for a, b in zip(medians, medians[1:]))
112
+
113
+
114
def enforce_ordered_metrics(metrics, grouped, cleaned, outlier_counts, summaries):
    """For metrics expected to rise with literacy, undo outlier removal.

    If a metric in ORDERED_METRICS no longer has monotonically
    non-decreasing medians after outlier filtering, fall back to the raw
    values for every label and recompute the summaries in place.
    """
    ordered = [m for m in metrics if m in ORDERED_METRICS]
    for metric in ordered:
        if medians_in_order(summaries, metric, LABEL_ORDER):
            continue
        for label in LABEL_ORDER:
            raw = grouped[label][metric]
            cleaned[label][metric] = raw
            outlier_counts[label][metric] = 0
            if raw:
                summaries[metric][label] = five_number_summary(raw)
126
+
127
+
128
def main():
    """CLI entry point: load scores, filter outliers, print stats and thresholds."""
    parser = argparse.ArgumentParser(
        description="Compute five-number summaries by literacy label with outlier removal."
    )
    parser.add_argument(
        "--input",
        default="/home/mshahidul/readctrl/data/factual_testing/full_details_evaluation_0_80_qwen3-30B_v2.json",
        help="Path to JSON evaluation file.",
    )
    parser.add_argument(
        "--metrics",
        default="factual_attribution,completeness,source_coverage",
        help="Comma-separated metrics to analyze.",
    )
    args = parser.parse_args()

    metrics = [m.strip() for m in args.metrics.split(",") if m.strip()]
    with open(args.input, "r", encoding="utf-8") as f:
        data = json.load(f)

    grouped = parse_scores(data, metrics)
    cleaned = {label: {} for label in LABEL_ORDER}
    outlier_counts = {label: {} for label in LABEL_ORDER}
    summaries = {m: {} for m in metrics}

    # Per (label, metric): drop IQR outliers, then summarize what remains.
    for label in LABEL_ORDER:
        for metric in metrics:
            kept, dropped = remove_outliers_iqr(grouped[label][metric])
            cleaned[label][metric] = kept
            outlier_counts[label][metric] = dropped
            if kept:
                summaries[metric][label] = five_number_summary(kept)

    # Metrics expected to rise with literacy may revert to raw values.
    enforce_ordered_metrics(metrics, grouped, cleaned, outlier_counts, summaries)

    print_summary(metrics, cleaned, outlier_counts, summaries)
    thresholds = suggest_thresholds(summaries, LABEL_ORDER)

    print("\nSuggested thresholds (based on cleaned quartiles/medians):")
    for metric in metrics:
        print(f"  Metric: {metric}")
        for name, value in thresholds[metric].items():
            if value is None:
                print(f"    {name}: n/a")
            else:
                print(f"    {name}: {value:.4f}")


if __name__ == "__main__":
    main()
code/literacy_thresholds_v2.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import json
4
+ import math
5
+ from statistics import median, quantiles
6
+
7
+
8
+ LABEL_ORDER = ["low", "intermediate", "proficient"]
9
+ TARGET_METRIC = "source_coverage"
10
+ ORDERED_METRICS = {TARGET_METRIC}
11
+
12
+
13
+ def normalize_label(key: str) -> str:
14
+ key_l = key.lower()
15
+ for label in LABEL_ORDER:
16
+ if label in key_l:
17
+ return label
18
+ return key_l
19
+
20
+
21
def five_number_summary(values):
    """Return min/q1/median/q3/max for `values`, or None if empty.

    statistics.quantiles() requires at least two data points, so a
    single-value sample (which the `if not values` guard alone lets
    through) is summarized as that value repeated.
    """
    if not values:
        return None
    if len(values) == 1:
        v = values[0]
        return {"min": v, "q1": v, "median": v, "q3": v, "max": v}
    q1, _, q3 = quantiles(values, n=4, method="inclusive")
    return {
        "min": min(values),
        "q1": q1,
        "median": median(values),
        "q3": q3,
        "max": max(values),
    }
32
+
33
+
34
+ def remove_outliers_iqr(values):
35
+ if len(values) < 4:
36
+ return values, 0
37
+ q1, _, q3 = quantiles(values, n=4, method="inclusive")
38
+ iqr = q3 - q1
39
+ if math.isclose(iqr, 0.0):
40
+ return values, 0
41
+ lower = q1 - 1.5 * iqr
42
+ upper = q3 + 1.5 * iqr
43
+ filtered = [v for v in values if lower <= v <= upper]
44
+ return filtered, len(values) - len(filtered)
45
+
46
+
47
+ def parse_scores(data, metrics):
48
+ grouped = {label: {m: [] for m in metrics} for label in LABEL_ORDER}
49
+ for item in data:
50
+ levels = item.get("literacy_levels") or {}
51
+ for key, payload in levels.items():
52
+ label = normalize_label(key)
53
+ if label not in grouped:
54
+ continue
55
+ scores = (payload or {}).get("scores") or {}
56
+ for m in metrics:
57
+ if m in scores and scores[m] is not None:
58
+ grouped[label][m].append(scores[m])
59
+ return grouped
60
+
61
+
62
+ def suggest_thresholds(per_label_summaries, label_order):
63
+ thresholds = {}
64
+ for metric in per_label_summaries:
65
+ thresholds[metric] = {}
66
+ for i in range(len(label_order) - 1):
67
+ lower_label = label_order[i]
68
+ upper_label = label_order[i + 1]
69
+ lower = per_label_summaries[metric].get(lower_label)
70
+ upper = per_label_summaries[metric].get(upper_label)
71
+ if not lower or not upper:
72
+ thresholds[metric][f"{lower_label}_to_{upper_label}"] = None
73
+ continue
74
+ if lower["q3"] < upper["q1"]:
75
+ boundary = (lower["q3"] + upper["q1"]) / 2
76
+ else:
77
+ boundary = (lower["median"] + upper["median"]) / 2
78
+ thresholds[metric][f"{lower_label}_to_{upper_label}"] = boundary
79
+ return thresholds
80
+
81
+
82
+ def print_summary(metrics, cleaned_by_label, outlier_counts, summaries):
83
+ for label in LABEL_ORDER:
84
+ print(f"\nLabel: {label}")
85
+ for m in metrics:
86
+ vals = cleaned_by_label[label][m]
87
+ summary = summaries[m].get(label)
88
+ removed = outlier_counts[label][m]
89
+ print(f" Metric: {m}")
90
+ print(f" Count (after outliers): {len(vals)}")
91
+ print(f" Outliers removed: {removed}")
92
+ if summary:
93
+ print(
94
+ " Five-number summary: "
95
+ f"min={summary['min']:.4f}, "
96
+ f"q1={summary['q1']:.4f}, "
97
+ f"median={summary['median']:.4f}, "
98
+ f"q3={summary['q3']:.4f}, "
99
+ f"max={summary['max']:.4f}"
100
+ )
101
+ else:
102
+ print(" Five-number summary: n/a")
103
+
104
+
105
def medians_in_order(summaries, metric, label_order):
    """Return True iff the metric's medians are non-decreasing across labels.

    Returns False if any label lacks a summary. Works for any number of
    labels; the original hard-coded exactly three
    (``medians[0] <= medians[1] <= medians[2]``), which raised IndexError
    for shorter orders.
    """
    medians = []
    for label in label_order:
        summary = summaries.get(metric, {}).get(label)
        if not summary:
            return False
        medians.append(summary["median"])
    return all(a <= b for a, b in zip(medians, medians[1:]))
113
+
114
+
115
+ def enforce_ordered_metrics(metrics, grouped, cleaned, outlier_counts, summaries):
116
+ for metric in metrics:
117
+ if metric not in ORDERED_METRICS:
118
+ continue
119
+ if medians_in_order(summaries, metric, LABEL_ORDER):
120
+ continue
121
+ for label in LABEL_ORDER:
122
+ raw_values = grouped[label][metric]
123
+ cleaned[label][metric] = raw_values
124
+ outlier_counts[label][metric] = 0
125
+ if raw_values:
126
+ summaries[metric][label] = five_number_summary(raw_values)
127
+
128
+
129
+ def main():
130
+ parser = argparse.ArgumentParser(
131
+ description="Compute five-number summaries for source_coverage by literacy label."
132
+ )
133
+ parser.add_argument(
134
+ "--input",
135
+ default="/home/mshahidul/readctrl/data/factual_testing/full_details_evaluation_0_80_qwen3-30B_v2.json",
136
+ help="Path to JSON evaluation file.",
137
+ )
138
+ args = parser.parse_args()
139
+
140
+ metrics = [TARGET_METRIC]
141
+ with open(args.input, "r", encoding="utf-8") as f:
142
+ data = json.load(f)
143
+
144
+ grouped = parse_scores(data, metrics)
145
+ cleaned = {label: {} for label in LABEL_ORDER}
146
+ outlier_counts = {label: {} for label in LABEL_ORDER}
147
+ summaries = {m: {} for m in metrics}
148
+
149
+ for label in LABEL_ORDER:
150
+ for m in metrics:
151
+ values = grouped[label][m]
152
+ filtered, removed = remove_outliers_iqr(values)
153
+ cleaned[label][m] = filtered
154
+ outlier_counts[label][m] = removed
155
+ if filtered:
156
+ summaries[m][label] = five_number_summary(filtered)
157
+
158
+ enforce_ordered_metrics(metrics, grouped, cleaned, outlier_counts, summaries)
159
+
160
+ print_summary(metrics, cleaned, outlier_counts, summaries)
161
+ thresholds = suggest_thresholds(summaries, LABEL_ORDER)
162
+
163
+ print("\nSuggested thresholds (based on cleaned quartiles/medians):")
164
+ for m in metrics:
165
+ print(f" Metric: {m}")
166
+ for k, v in thresholds[m].items():
167
+ if v is None:
168
+ print(f" {k}: n/a")
169
+ else:
170
+ print(f" {k}: {v:.4f}")
171
+
172
+
173
+ if __name__ == "__main__":
174
+ main()
code/old/FH_es.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ # --- Spanish tokenization ---
4
+ WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
5
+
6
+ def _tokenize_words_es(text: str):
7
+ return WORD_RE.findall(text)
8
+
9
+ def _count_sentences_es(text: str) -> int:
10
+ # Count sentences via ., !, ?, … and Spanish ¡¿
11
+ sentences = re.split(r"[.!?…]+|[¡¿]", text)
12
+ return max(1, sum(1 for s in sentences if s.strip()))
13
+
14
# --- Syllable counting ---
try:
    import pyphen
    _dic = pyphen.Pyphen(lang='es')  # 'es_ES' also works

    def count_syllables_es(word: str) -> int:
        """Approximate syllables via dictionary hyphenation points."""
        pieces = _dic.inserted(word).split('-')
        return max(1, len(pieces))
except Exception:
    # Heuristic fallback: count vowel nuclei, roughly honoring hiatus and
    # the silent 'u' of que/qui/gue/gui.
    def count_syllables_es(word: str) -> int:
        w = word.lower()

        # Final 'y' sounds like the vowel 'i'.
        w = re.sub(r'y$', 'i', w)

        # Drop the silent 'u' in que/qui/gue/gui (güe/güi keep their ü).
        for digraph, repl in (('que', 'qe'), ('qui', 'qi'), ('gue', 'ge'), ('gui', 'gi')):
            w = re.sub(digraph, repl, w)

        vowels = set("aeiouáéíóúü")
        strong = set("aáeéoóíú")  # accented í/ú act strong (force hiatus)
        syllables = 0
        pos = 0
        length = len(w)
        while pos < length:
            if w[pos] not in vowels:
                pos += 1
                continue
            # Gather a maximal run of contiguous vowels.
            end = pos + 1
            while end < length and w[end] in vowels:
                end += 1
            cluster = w[pos:end]
            # Each strong-strong adjacency starts an extra nucleus (hiatus).
            nuclei = 1 + sum(
                1
                for a, b in zip(cluster, cluster[1:])
                if a in strong and b in strong
            )
            syllables += nuclei
            pos = end
        return max(1, syllables)
60
+
61
# --- Fernández–Huerta (FH) ---
def fernandez_huerta(text: str) -> float | None:
    """
    Fernández–Huerta readability score for Spanish text.

    Higher = easier; typical range ~0–100. Returns None for text with no
    word tokens.
    """
    tokens = _tokenize_words_es(text)
    if not tokens:
        return None
    word_count = len(tokens)
    sentence_count = _count_sentences_es(text)
    syllable_count = sum(count_syllables_es(t) for t in tokens)

    # FH = 206.84 - 0.60*P - 1.02*F where
    #   P = syllables per 100 words, F = mean words per sentence.
    p_term = (syllable_count / word_count) * 100.0
    f_term = word_count / sentence_count
    return round(206.84 - 0.60 * p_term - 1.02 * f_term, 2)
78
+
79
+ # --- Quick check ---
80
+ # if __name__ == "__main__":
81
+ # text_easy = "El corazón es un órgano que bombea sangre. En este caso, funciona bien."
82
+ # text_medium = "El corazón del paciente muestra una función adecuada, aunque se observaron pequeños cambios que deben revisarse."
83
+ # text_hard = "La evaluación cardiológica indicó una función sistólica preservada, con alteraciones discretas en la relajación diastólica."
84
+ # print("Easy FH:", fernandez_huerta(text_easy))
85
+ # print("Medium FH:", fernandez_huerta(text_medium))
86
+ # print("Hard FH:", fernandez_huerta(text_hard))
code/old/FH_esV2.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import separasilabas
3
+
4
def count_words(text):
    """Count word tokens after stripping digits; never returns less than 1."""
    no_digits = ''.join(ch for ch in text if not ch.isdigit())
    normalized = re.sub(r'\W+', ' ', no_digits).strip()
    words = normalized.split()
    return len(words) if words else 1
9
+
10
def count_sentences(text):
    """Count sentence-like segments split on . : ; ! ? ( ) — minimum 1."""
    flat = text.replace("\n", "")
    segments = [s for s in re.split(r'[.:;!?\)\()]', flat) if s]
    return len(segments) if segments else 1
16
+
17
def count_all_syllables(text):
    """Total syllables over all words, via the separasilabas splitter (min 1)."""
    words = re.sub(r'\W+', ' ', text).strip().split()
    splitter = separasilabas.silabizer()
    total = sum(len(splitter(word)) for word in words)
    return total if total > 0 else 1
25
+
26
def Pval(text):
    """Mean syllables per word, rounded to 2 decimals."""
    return round(count_all_syllables(text) / count_words(text), 2)
30
+
31
def Fval(text):
    """Mean words per sentence, rounded to 2 decimals."""
    return round(count_words(text) / count_sentences(text), 2)
35
+
36
def fernandez_huerta(text):
    """Fernández–Huerta index: 206.84 - 60*P - 1.02*F (higher = easier)."""
    return round(206.84 - 60 * Pval(text) - 1.02 * Fval(text), 2)
38
+
39
+
code/old/FH_fr.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ try:
3
+ import pyphen
4
+ _hyph_fr = pyphen.Pyphen(lang='fr') # or 'fr_FR'
5
+ except Exception:
6
+ _hyph_fr = None
7
+
8
# --- Basic French tokenization ---
# Latin letters incl. French accented ranges and the œ/Œ, æ/Æ ligatures.
WORD_RE_FR = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿœŒÆæ]+", re.UNICODE)

def tokenize_words_fr(text: str):
    """Return the French word tokens in `text`."""
    return WORD_RE_FR.findall(text)
13
+
14
def count_sentences_fr(text: str):
    """Count sentences split on . ! ? … (minimum 1); deliberately simple."""
    segments = re.split(r"[.!?…]+", text)
    return max(1, len([s for s in segments if s.strip()]))
18
+
19
def count_syllables_fr(word: str) -> int:
    """Approximate syllable count for a French word.

    Uses the module-level Pyphen hyphenator when available (hyphenation
    points approximate syllable boundaries); otherwise falls back to a
    rougher vowel-group heuristic.
    """
    if _hyph_fr:
        pieces = _hyph_fr.inserted(word).split('-')
        return max(1, len(pieces))
    clusters = re.findall(r"[aeiouyAEIOUYàâäéèêëîïôöùûüÿœAEIOUYÀÂÄÉÈÊËÎÏÔÖÙÛÜŸŒ]+", word)
    return max(1, len(clusters))
27
+
28
# --- FRE-FR (Kandel & Moles) ---
def flesch_kandel_moles_fr(text: str):
    """Kandel–Moles adaptation of Flesch Reading Ease for French.

    Returns None when the text contains no word tokens.
    """
    tokens = tokenize_words_fr(text)
    if not tokens:
        return None
    n_words = len(tokens)
    n_sentences = count_sentences_fr(text)
    n_syllables = sum(count_syllables_fr(t) for t in tokens)
    per_hundred = (n_syllables / n_words) * 100.0  # syllables per 100 words
    avg_sentence_len = n_words / n_sentences       # words per sentence
    return round(207.0 - 1.015 * avg_sentence_len - 0.736 * per_hundred, 2)
40
+
41
# --- LIX / RIX ---
def lix(text: str):
    """LIX readability: words/sentence + 100 * share of long words (len > 6)."""
    tokens = tokenize_words_fr(text)
    if not tokens:
        return None
    n_words = len(tokens)
    n_sentences = count_sentences_fr(text)
    n_long = sum(1 for t in tokens if len(t) > 6)
    return round((n_words / n_sentences) + (100.0 * n_long / n_words), 2)
50
+
51
def rix(text: str):
    """RIX readability: long words (len > 6) per sentence."""
    tokens = tokenize_words_fr(text)
    if not tokens:
        return None
    n_long = sum(1 for t in tokens if len(t) > 6)
    return round(n_long / count_sentences_fr(text), 2)
59
+
60
# --- Band checks ---
# (lo, hi) score ranges per reading band.
FRE_FR_BANDS = {
    'B1': (70, 100),
    'B2': (60, 70),
    'B3': (45, 60),
}
LIX_BANDS = {
    'B1': (20, 35),
    'B2': (35, 45),
    'B3': (45, 60),
}

def in_band(score, band, bands, delta=0.0):
    """True iff `score` lies within `bands[band]`, widened by `delta` on each side.

    None scores (e.g. from empty text) are never in a band.
    """
    if score is None:
        return False
    lo, hi = bands[band]
    return lo - delta <= score <= hi + delta
77
+
78
+ # Example
79
+ # if __name__ == "__main__":
80
+ # txt = "Le patient se porte bien. Les examens sont rassurants, sans signes d’infection. Un suivi simple est recommandé."
81
+ # fre = flesch_kandel_moles_fr(txt)
82
+ # lx = lix(txt)
83
+ # rx = rix(txt)
84
+ # print("FRE-FR:", fre, "B1?", in_band(fre, 'B1', FRE_FR_BANDS, delta=1.0))
85
+ # print("LIX:", lx, "B1?", in_band(lx, 'B1', LIX_BANDS, delta=2.0))
86
+ # print("RIX:", rx)
code/old/FH_pt.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ try:
3
+ import pyphen
4
+ _hyph_pt_br = pyphen.Pyphen(lang='pt_BR')
5
+ _hyph_pt_pt = pyphen.Pyphen(lang='pt_PT')
6
+ except Exception:
7
+ _hyph_pt_br = _hyph_pt_pt = None
8
+
9
+ # --- Tokenization ---
10
+ WORD_RE_PT = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", re.UNICODE) # includes áâãà ç éê í óôõ ú ü etc.
11
+
12
+ def tokenize_words_pt(text: str):
13
+ return WORD_RE_PT.findall(text)
14
+
15
def count_sentences_pt(text: str):
    """Count sentences split on . ! ? … (minimum 1); deliberately simple."""
    chunks = re.split(r"[.!?…]+", text)
    return max(1, sum(1 for c in chunks if c.strip()))
19
+
20
def count_syllables_pt(word: str) -> int:
    """Approximate syllable count for a Portuguese word.

    Prefers the hyphenation dictionaries (pt_BR first, then pt_PT) set up at
    module import; falls back to a rough vowel-group heuristic otherwise.
    """
    dic = _hyph_pt_br or _hyph_pt_pt
    if dic:
        pieces = dic.inserted(word).split('-')
        return max(1, len(pieces))
    groups = re.findall(r"[aeiouyAEIOUYàáâãéêíóôõúüÀÁÂÃÉÊÍÓÔÕÚÜ]+", word)
    return max(1, len(groups))
28
+
29
# --- Flesch Reading Ease (Portuguese adaptation) ---
def flesch_portuguese(text: str):
    """Portuguese-adapted Flesch Reading Ease; None if the text has no words."""
    tokens = tokenize_words_pt(text)
    if not tokens:
        return None
    n_words = len(tokens)
    n_sentences = count_sentences_pt(text)
    n_syllables = sum(count_syllables_pt(t) for t in tokens)
    words_per_sentence = n_words / n_sentences
    syllables_per_word = n_syllables / n_words
    return round(248.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word, 2)
41
+
42
+ # --- LIX / RIX ---
43
+ def lix(text: str):
44
+ words = tokenize_words_pt(text)
45
+ W = len(words)
46
+ if W == 0:
47
+ return None
48
+ S = count_sentences_pt(text)
49
+ long_words = sum(1 for w in words if len(w) > 6)
50
+ return round((W / S) + (100.0 * long_words / W), 2)
51
+
52
+ def rix(text: str):
53
+ words = tokenize_words_pt(text)
54
+ W = len(words)
55
+ if W == 0:
56
+ return None
57
+ S = count_sentences_pt(text)
58
+ long_words = sum(1 for w in words if len(w) > 6)
59
+ return round(long_words / S, 2)
60
+
61
+ # --- Band checks ---
62
+ FRE_PT_BANDS = {
63
+ 'B1': (70, 100),
64
+ 'B2': (60, 70),
65
+ 'B3': (45, 60),
66
+ }
67
+ LIX_BANDS = {
68
+ 'B1': (20, 35),
69
+ 'B2': (35, 45),
70
+ 'B3': (45, 60),
71
+ }
72
+
73
+ def in_band(score, band, bands, delta=0.0):
74
+ if score is None:
75
+ return False
76
+ lo, hi = bands[band]
77
+ return (lo - delta) <= score <= (hi + delta)
78
+
79
+ # Example
80
+ if __name__ == "__main__":
81
+ txt = "O paciente está bem. Os exames não mostram sinais de infecção. Recomenda-se apenas acompanhamento."
82
+ fre = flesch_portuguese(txt)
83
+ lx = lix(txt)
84
+ rx = rix(txt)
85
+ print("FRE-PT:", fre, "B1?", in_band(fre, 'B1', FRE_PT_BANDS, delta=1.0))
86
+ print("LIX:", lx, "B1?", in_band(lx, 'B1', LIX_BANDS, delta=2.0))
87
+ print("RIX:", rx)
code/old/generate_thinking_data.ipynb ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d3bff56e",
6
+ "metadata": {},
7
+ "source": [
8
+ "https://lmarena.ai/c/9fa09cff-fb85-4719-80db-188a19de0803"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "1a11463f",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import json\n",
19
+ "import random\n",
20
+ "from typing import List, Dict, Any, Optional\n",
21
+ "\n",
22
+ "# Your existing prompts for different readability levels\n",
23
+ "PROMPTS = {\n",
24
+ " \"easy\": '''\n",
25
+ "You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.\n",
26
+ "Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).\n",
27
+ "Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.\n",
28
+ "Keep all important factual details, but remove jargon.\n",
29
+ "Return only the rewritten text without commentary.\n",
30
+ "''',\n",
31
+ " \"intermediate\": '''\n",
32
+ "You are an assistant specialized in rewriting Spanish texts with medium readability.\n",
33
+ "Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).\n",
34
+ "Use clear and complete sentences, moderately complex vocabulary, and structured narration.\n",
35
+ "Retain all relevant medical or factual information, but phrase it in accessible language.\n",
36
+ "Return only the rewritten text with no explanations.\n",
37
+ "''',\n",
38
+ " \"hard\": '''\n",
39
+ "You are an assistant that rewrites Spanish medical texts with professional, technical precision.\n",
40
+ "Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.\n",
41
+ "The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).\n",
42
+ "Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.\n",
43
+ "Return only the rewritten text.\n",
44
+ "'''\n",
45
+ "}\n",
46
+ "\n",
47
+ "# Thinking templates for processing medical reports\n",
48
+ "THINKING_TEMPLATES = {\n",
49
+ " \"input_analysis\": [\n",
50
+ " \"\"\"Estoy analizando este informe médico. Primero debo identificar:\n",
51
+ "1. Datos del paciente: {patient_info}\n",
52
+ "2. Diagnóstico principal: {diagnosis}\n",
53
+ "3. Síntomas y signos clínicos: {symptoms}\n",
54
+ "4. Pruebas realizadas: {tests}\n",
55
+ "5. Tratamiento: {treatment}\n",
56
+ "\n",
57
+ "Ahora debo adaptar esta información al nivel de lectura solicitado: {difficulty}.\"\"\",\n",
58
+ "\n",
59
+ " \"\"\"Este es un informe médico que necesito reescribir. Contiene:\n",
60
+ "- Información clínica sobre {diagnosis}\n",
61
+ "- Terminología médica como: {medical_terms}\n",
62
+ "- Datos técnicos que debo {action} según el nivel {difficulty}\n",
63
+ "Mi objetivo es mantener la precisión médica mientras ajusto la complejidad del lenguaje.\"\"\"\n",
64
+ " ],\n",
65
+ " \n",
66
+ " \"easy\": [\n",
67
+ " \"\"\"Para nivel fácil (FH 70-100), debo:\n",
68
+ "1. Cambiar \"{medical_term}\" por \"{simple_term}\"\n",
69
+ "2. Dividir oraciones largas en frases cortas\n",
70
+ "3. Eliminar jerga médica innecesaria\n",
71
+ "4. Usar palabras que un niño de 10-12 años entienda\n",
72
+ "5. Mantener la historia clara y simple\n",
73
+ "\n",
74
+ "Voy a contar esto como una historia sobre {patient_description} que {simple_story}.\"\"\",\n",
75
+ "\n",
76
+ " \"\"\"Necesito simplificar mucho este texto:\n",
77
+ "- Cambiar términos médicos complejos por palabras cotidianas\n",
78
+ "- Usar máximo 10-15 palabras por oración\n",
79
+ "- Explicar todo como si fuera para un niño\n",
80
+ "- Mantener solo la información esencial\n",
81
+ "- Hacer que suene amigable y no aterrador\"\"\",\n",
82
+ " ],\n",
83
+ " \n",
84
+ " \"intermediate\": [\n",
85
+ " \"\"\"Para nivel intermedio (FH 50-70), mi estrategia es:\n",
86
+ "1. Mantener algunos términos médicos pero explicarlos brevemente\n",
87
+ "2. Usar oraciones de complejidad media (15-20 palabras)\n",
88
+ "3. Estructurar la información en párrafos lógicos\n",
89
+ "4. Incluir detalles relevantes sin ser excesivamente técnico\n",
90
+ "5. Vocabulario apropiado para estudiantes de secundaria\n",
91
+ "\n",
92
+ "El texto debe ser informativo pero accesible, manteniendo {key_concepts} pero explicando {complex_terms}.\"\"\",\n",
93
+ "\n",
94
+ " \"\"\"Nivel intermedio requiere equilibrio:\n",
95
+ "- Puedo usar términos como \"{medical_term}\" pero debo contextualizarlos\n",
96
+ "- Las oraciones pueden ser más complejas pero claras\n",
97
+ "- Incluir información sobre causas y efectos\n",
98
+ "- Mantener estructura narrativa coherente\n",
99
+ "- Apropiado para lectores con educación media\"\"\",\n",
100
+ " ],\n",
101
+ " \n",
102
+ " \"hard\": [\n",
103
+ " \"\"\"Para nivel profesional (FH 0-50), debo maximizar la precisión técnica:\n",
104
+ "1. Usar nomenclatura médica internacional: {technical_terms}\n",
105
+ "2. Incluir todos los valores de laboratorio y mediciones específicas\n",
106
+ "3. Emplear terminología especializada sin simplificación\n",
107
+ "4. Formato de historia clínica hospitalaria\n",
108
+ "5. Densidad informativa máxima\n",
109
+ "\n",
110
+ "Estructuraré según: Anamnesis → Exploración física → Pruebas complementarias → Diagnóstico → Plan terapéutico.\"\"\",\n",
111
+ "\n",
112
+ " \"\"\"Reescritura altamente técnica requerida:\n",
113
+ "- Incorporar clasificaciones internacionales (CIE-10, DSM-5, etc.)\n",
114
+ "- Detallar fisiopatología y mecanismos moleculares\n",
115
+ "- Usar abreviaturas médicas estándar\n",
116
+ "- Incluir diagnósticos diferenciales\n",
117
+ "- Lenguaje de publicación científica\n",
118
+ "- Máxima densidad de información médica especializada\"\"\",\n",
119
+ " ]\n",
120
+ "}\n",
121
+ "\n",
122
+ "class MedicalReportProcessor:\n",
123
+ " \"\"\"Process medical reports and create training data with thinking mode.\"\"\"\n",
124
+ " \n",
125
+ " def __init__(self, original_report: str):\n",
126
+ " \"\"\"\n",
127
+ " Initialize with the original medical report.\n",
128
+ " \n",
129
+ " Args:\n",
130
+ " original_report: The original medical report text to be rewritten\n",
131
+ " \"\"\"\n",
132
+ " self.original_report = original_report\n",
133
+ " self.medical_entities = self.extract_medical_entities(original_report)\n",
134
+ " \n",
135
+ " def extract_medical_entities(self, text: str) -> Dict[str, List[str]]:\n",
136
+ " \"\"\"Extract medical entities from the report.\"\"\"\n",
137
+ " # This is a simplified extraction - you might want to use a medical NER model\n",
138
+ " entities = {\n",
139
+ " \"diagnosis\": [],\n",
140
+ " \"symptoms\": [],\n",
141
+ " \"medications\": [],\n",
142
+ " \"tests\": [],\n",
143
+ " \"medical_terms\": []\n",
144
+ " }\n",
145
+ " \n",
146
+ " # Common medical terms to look for\n",
147
+ " diagnosis_keywords = [\"diagnóstico\", \"síndrome\", \"enfermedad\", \"trastorno\", \"patología\", \n",
148
+ " \"neurofibromatosis\", \"nf1\", \"tdah\", \"déficit\"]\n",
149
+ " symptom_keywords = [\"dolor\", \"mancha\", \"nódulo\", \"bulto\", \"lesión\", \"síntoma\",\n",
150
+ " \"retraso\", \"dificultad\", \"problema\"]\n",
151
+ " medication_keywords = [\"medicamento\", \"tratamiento\", \"terapia\", \"metilfenidato\", \"fármaco\"]\n",
152
+ " test_keywords = [\"biopsia\", \"ecografía\", \"análisis\", \"prueba\", \"examen\", \"resonancia\"]\n",
153
+ " \n",
154
+ " text_lower = text.lower()\n",
155
+ " \n",
156
+ " # Extract based on keywords\n",
157
+ " for keyword in diagnosis_keywords:\n",
158
+ " if keyword in text_lower:\n",
159
+ " entities[\"diagnosis\"].append(keyword)\n",
160
+ " \n",
161
+ " for keyword in symptom_keywords:\n",
162
+ " if keyword in text_lower:\n",
163
+ " entities[\"symptoms\"].append(keyword)\n",
164
+ " \n",
165
+ " for keyword in medication_keywords:\n",
166
+ " if keyword in text_lower:\n",
167
+ " entities[\"medications\"].append(keyword)\n",
168
+ " \n",
169
+ " for keyword in test_keywords:\n",
170
+ " if keyword in text_lower:\n",
171
+ " entities[\"tests\"].append(keyword)\n",
172
+ " \n",
173
+ " # Extract all medical terms\n",
174
+ " all_medical = diagnosis_keywords + symptom_keywords + medication_keywords + test_keywords\n",
175
+ " for term in all_medical:\n",
176
+ " if term in text_lower:\n",
177
+ " entities[\"medical_terms\"].append(term)\n",
178
+ " \n",
179
+ " return entities\n",
180
+ " \n",
181
+ " def generate_input_thinking(self, difficulty: str) -> str:\n",
182
+ " \"\"\"Generate thinking for understanding the input medical report.\"\"\"\n",
183
+ " template = random.choice(THINKING_TEMPLATES[\"input_analysis\"])\n",
184
+ " \n",
185
+ " thinking = template.format(\n",
186
+ " patient_info=\"paciente de 18 años\" if \"18 años\" in self.original_report else \"paciente\",\n",
187
+ " diagnosis=\", \".join(self.medical_entities[\"diagnosis\"][:2]) or \"condición médica\",\n",
188
+ " symptoms=\", \".join(self.medical_entities[\"symptoms\"][:3]) or \"síntomas diversos\",\n",
189
+ " tests=\", \".join(self.medical_entities[\"tests\"][:2]) or \"estudios clínicos\",\n",
190
+ " treatment=\", \".join(self.medical_entities[\"medications\"][:2]) or \"tratamiento\",\n",
191
+ " difficulty=difficulty,\n",
192
+ " medical_terms=\", \".join(self.medical_entities[\"medical_terms\"][:3]),\n",
193
+ " action=\"simplificar mucho\" if difficulty == \"easy\" else \"adaptar\" if difficulty == \"intermediate\" else \"tecnificar\"\n",
194
+ " )\n",
195
+ " \n",
196
+ " return thinking\n",
197
+ " \n",
198
+ " def generate_output_thinking(self, difficulty: str, rewritten_text: str) -> str:\n",
199
+ " \"\"\"Generate thinking for the rewriting process.\"\"\"\n",
200
+ " template = random.choice(THINKING_TEMPLATES[difficulty])\n",
201
+ " \n",
202
+ " # Customize based on difficulty\n",
203
+ " if difficulty == \"easy\":\n",
204
+ " thinking = template.format(\n",
205
+ " medical_term=self.medical_entities[\"medical_terms\"][0] if self.medical_entities[\"medical_terms\"] else \"término médico\",\n",
206
+ " simple_term=\"enfermedad\" if \"neurofibromatosis\" in self.medical_entities[\"diagnosis\"] else \"problema de salud\",\n",
207
+ " patient_description=\"un joven\",\n",
208
+ " simple_story=\"tenía una enfermedad especial desde pequeño\"\n",
209
+ " )\n",
210
+ " elif difficulty == \"intermediate\":\n",
211
+ " thinking = template.format(\n",
212
+ " key_concepts=\", \".join(self.medical_entities[\"diagnosis\"][:2]) or \"conceptos médicos principales\",\n",
213
+ " complex_terms=\", \".join(self.medical_entities[\"medical_terms\"][:3]) or \"terminología especializada\",\n",
214
+ " medical_term=self.medical_entities[\"medical_terms\"][0] if self.medical_entities[\"medical_terms\"] else \"término médico\"\n",
215
+ " )\n",
216
+ " else: # hard\n",
217
+ " thinking = template.format(\n",
218
+ " technical_terms=\", \".join(self.medical_entities[\"medical_terms\"][:5]) or \"terminología especializada\"\n",
219
+ " )\n",
220
+ " \n",
221
+ " return thinking\n",
222
+ " \n",
223
+ " def create_training_example(self, difficulty: str, rewritten_text: str, fh_score: float) -> Dict:\n",
224
+ " \"\"\"Create a complete training example with thinking.\"\"\"\n",
225
+ " \n",
226
+ " # Generate system message\n",
227
+ " system_content = PROMPTS[difficulty].strip()\n",
228
+ " \n",
229
+ " # Generate thinking for input and output\n",
230
+ " input_thinking = self.generate_input_thinking(difficulty)\n",
231
+ " output_thinking = self.generate_output_thinking(difficulty, rewritten_text)\n",
232
+ " \n",
233
+ " # Create the message structure\n",
234
+ " messages = [\n",
235
+ " {\n",
236
+ " \"content\": f\"reasoning language: Spanish\\n\\n{system_content}\",\n",
237
+ " \"role\": \"system\",\n",
238
+ " \"thinking\": None\n",
239
+ " },\n",
240
+ " {\n",
241
+ " \"content\": f\"Please rewrite the following medical report to achieve a Fernández Huerta score of {fh_score:.1f} (difficulty level: {difficulty}):\\n\\n{self.original_report}\",\n",
242
+ " \"role\": \"user\",\n",
243
+ " \"thinking\": input_thinking\n",
244
+ " },\n",
245
+ " {\n",
246
+ " \"content\": rewritten_text,\n",
247
+ " \"role\": \"assistant\",\n",
248
+ " \"thinking\": output_thinking\n",
249
+ " }\n",
250
+ " ]\n",
251
+ " \n",
252
+ " return {\"messages\": messages}\n",
253
+ "\n",
254
+ "def process_medical_dataset_with_original(\n",
255
+ " original_reports: List[str],\n",
256
+ " readability_versions_list: List[Dict],\n",
257
+ " include_variations: bool = True\n",
258
+ ") -> List[Dict]:\n",
259
+ " \"\"\"\n",
260
+ " Process medical dataset with original reports and create training data.\n",
261
+ " \n",
262
+ " Args:\n",
263
+ " original_reports: List of original medical reports\n",
264
+ " readability_versions_list: List of dictionaries with readability versions\n",
265
+ " include_variations: Whether to include cross-difficulty variations\n",
266
+ " \n",
267
+ " Returns:\n",
268
+ " List of training examples with thinking mode\n",
269
+ " \"\"\"\n",
270
+ " training_dataset = []\n",
271
+ " \n",
272
+ " for original_report, versions_dict in zip(original_reports, readability_versions_list):\n",
273
+ " processor = MedicalReportProcessor(original_report)\n",
274
+ " readability_versions = versions_dict.get(\"readability_versions\", {})\n",
275
+ " \n",
276
+ " # Create training examples for each difficulty level\n",
277
+ " for difficulty, content in readability_versions.items():\n",
278
+ " rewritten_text = content[\"text\"]\n",
279
+ " fh_score = content[\"FH_score\"]\n",
280
+ " \n",
281
+ " training_example = processor.create_training_example(\n",
282
+ " difficulty=difficulty,\n",
283
+ " rewritten_text=rewritten_text,\n",
284
+ " fh_score=fh_score\n",
285
+ " )\n",
286
+ " \n",
287
+ " training_dataset.append(training_example)\n",
288
+ " \n",
289
+ " # Optionally create cross-difficulty variations\n",
290
+ " if include_variations:\n",
291
+ " difficulties = list(readability_versions.keys())\n",
292
+ " \n",
293
+ " # Create some mixed examples (e.g., easy to hard, hard to intermediate)\n",
294
+ " for _ in range(2): # Create 2 variations per report\n",
295
+ " source_diff = random.choice(difficulties)\n",
296
+ " target_diff = random.choice([d for d in difficulties if d != source_diff])\n",
297
+ " \n",
298
+ " # Use source difficulty text as \"original\" for variation\n",
299
+ " source_text = readability_versions[source_diff][\"text\"]\n",
300
+ " target_text = readability_versions[target_diff][\"text\"]\n",
301
+ " target_fh = readability_versions[target_diff][\"FH_score\"]\n",
302
+ " \n",
303
+ " # Create processor for this variation\n",
304
+ " var_processor = MedicalReportProcessor(source_text)\n",
305
+ " variation_example = var_processor.create_training_example(\n",
306
+ " difficulty=target_diff,\n",
307
+ " rewritten_text=target_text,\n",
308
+ " fh_score=target_fh\n",
309
+ " )\n",
310
+ " \n",
311
+ " training_dataset.append(variation_example)\n",
312
+ " \n",
313
+ " return training_dataset\n",
314
+ "\n",
315
+ "# Example usage\n",
316
+ "if __name__ == \"__main__\":\n",
317
+ " # Example original medical reports (these would be your actual original reports)\n",
318
+ " original_medical_reports = [\n",
319
+ " \"\"\"Paciente masculino de 18 años con diagnóstico molecular confirmado de Neurofibromatosis tipo 1 \n",
320
+ " (deleción exones 5-47 del gen NF1), que presenta antecedentes de retraso del desarrollo psicomotor \n",
321
+ " global diagnosticado a los 3 años, trastorno específico del lenguaje de tipo expresivo que requirió \n",
322
+ " intervención fonoaudiológica, y TDAH en tratamiento con metilfenidato 20mg/día con buena respuesta. \n",
323
+ " Hallazgos oftalmológicos incluyen nódulos de Lisch bilaterales, astigmatismo miópico compuesto y \n",
324
+ " euriblefaron bilateral. Motivo de consulta actual: aparición de placa eritematosa de 3cm en muslo \n",
325
+ " izquierdo de 12 meses de evolución y múltiples nódulos subcutáneos móviles no dolorosos en región \n",
326
+ " supraciliar derecha, occipital y muñeca izquierda. Examen físico revela macrocefalia (PC 59cm, >p97), \n",
327
+ " 15 máculas café con leche >1.5cm, efélides axilares e inguinales bilaterales, y 3 máculas \n",
328
+ " rojo-azuladas deprimidas de 0.5-1cm en región lumbar y pectoral derecha. Estudios histopatológicos \n",
329
+ " confirman neurofibromas con inmunohistoquímica S100(+), SOX10(+). Ecografía de partes blandas \n",
330
+ " muestra lesiones hipoecoicas bien delimitadas compatibles con neurofibromas subcutáneos.\"\"\"\n",
331
+ " ]\n",
332
+ " \n",
333
+ " # Your readability versions data\n",
334
+ " readability_data = [\n",
335
+ " {\n",
336
+ " \"readability_versions\": {\n",
337
+ " \"easy\": {\n",
338
+ " \"text\": \"Un joven de 18 años tenía una enfermedad llamada Neurofibromatosis tipo 1 desde que era bebé. Esta enfermedad produce manchas café con leche en la piel y pequeños bultos. Durante su infancia tuvo algunas dificultades para hablar y moverse bien, por lo que recibió terapias especiales. En la adolescencia le dieron medicamentos para mejorar su concentración. A los 18 años fue al dermatólogo porque le salió una nueva mancha en el muslo y algunos bultos en la piel. Le hicieron exámenes y confirmaron que eran parte de su enfermedad. Los médicos clasificaron los distintos tipos de manchas y bultos que tenía en la piel.\",\n",
339
+ " \"FH_score\": 77.16\n",
340
+ " },\n",
341
+ " \"intermediate\": {\n",
342
+ " \"text\": \"Un joven de 18 años con Neurofibromatosis tipo 1, diagnosticada desde el primer año de vida, había presentado dificultades motoras y del lenguaje durante la infancia, además de problemas visuales como nódulos de Lisch y astigmatismo. Fue tratado por Trastorno por Déficit Atencional con buenos resultados académicos. Consultó en Dermatología por una nueva mancha en el muslo izquierdo y la aparición de nódulos en zonas como la muñeca y el cuero cabelludo. En el examen se observaron manchas café con leche, pecas en las axilas y varios bultos pequeños bajo la piel. Se realizaron biopsias y ecografías que confirmaron que las lesiones correspondían a diferentes tipos de neurofibromas superficiales, los cuales fueron clasificados según su forma y localización.\",\n",
343
+ " \"FH_score\": 62.77\n",
344
+ " },\n",
345
+ " \"hard\": {\n",
346
+ " \"text\": \"Varón de 18 años con diagnóstico clínico y molecular de Neurofibromatosis tipo 1 (deleción de exones 5-47 del gen NF1), con antecedentes de retraso psicomotor global, trastorno específico del lenguaje expresivo, TDAH tratado con metilfenidato y hallazgos oftalmológicos compatibles con NF1 (nódulos de Lisch, astigmatismo y euriblefaron). Acude a Dermatología por aparición de placa rosada en muslo izquierdo de un año de evolución y nódulos subcutáneos móviles en región supraciliar derecha, occipital y muñeca. El examen físico revela macrocefalia, múltiples máculas café con leche, efélides axilares y máculas rojo-azuladas deprimidas en región lumbar y pectoral. Las biopsias cutáneas y ecografía de nódulos confirmaron neurofibromas superficiales. Según la clasificación de García-Martínez et al., se diagnosticaron simultáneamente neurofibromas subcutáneos nodulares, cutáneos pseudoatróficos y cutáneos rojo-azulados, evidenciando la heterogeneidad fenotípica de la enfermedad en un mismo paciente.\",\n",
347
+ " \"FH_score\": 39.94\n",
348
+ " }\n",
349
+ " }\n",
350
+ " }\n",
351
+ " ]\n",
352
+ " \n",
353
+ " # Process the dataset with original reports\n",
354
+ " training_dataset = process_medical_dataset_with_original(\n",
355
+ " original_reports=original_medical_reports,\n",
356
+ " readability_versions_list=readability_data,\n",
357
+ " include_variations=True\n",
358
+ " )\n",
359
+ " \n",
360
+ " # Save the training dataset\n",
361
+ " with open(\"medical_report_finetuning_with_thinking.jsonl\", \"w\", encoding=\"utf-8\") as f:\n",
362
+ " for example in training_dataset:\n",
363
+ " f.write(json.dumps(example, ensure_ascii=False) + \"\\n\")\n",
364
+ " \n",
365
+ " # Print example for verification\n",
366
+ " print(\"Example training data with original medical report:\")\n",
367
+ " print(json.dumps(training_dataset[0], ensure_ascii=False, indent=2))\n",
368
+ " \n",
369
+ " # Print statistics\n",
370
+ " print(f\"\\n📊 Dataset Statistics:\")\n",
371
+ " print(f\"Total training examples: {len(training_dataset)}\")\n",
372
+ " print(f\"Number of messages per example: {len(training_dataset[0]['messages'])}\")\n",
373
+ " print(f\"All examples have thinking: {all('thinking' in msg for ex in training_dataset for msg in ex['messages'])}\")\n",
374
+ " \n",
375
+ " # Validate the structure\n",
376
+ " for i, example in enumerate(training_dataset):\n",
377
+ " assert len(example['messages']) == 3, f\"Example {i} doesn't have 3 messages\"\n",
378
+ " assert example['messages'][0]['role'] == 'system', f\"Example {i} first message is not system\"\n",
379
+ " assert example['messages'][1]['role'] == 'user', f\"Example {i} second message is not user\"\n",
380
+ " assert example['messages'][2]['role'] == 'assistant', f\"Example {i} third message is not assistant\"\n",
381
+ " assert 'thinking' in example['messages'][1], f\"Example {i} user message missing thinking\"\n",
382
+ " assert 'thinking' in example['messages'][2], f\"Example {i} assistant message missing thinking\"\n",
383
+ " \n",
384
+ " print(\"✅ All validation checks passed!\")"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "markdown",
389
+ "id": "123b65b3",
390
+ "metadata": {},
391
+ "source": [
392
+ "Example training data with original medical report:\n",
393
+ "{\n",
394
+ " \"messages\": [\n",
395
+ " {\n",
396
+ " \"content\": \"reasoning language: Spanish\\n\\nYou are an assistant that rewrites Spanish texts to make them very simple and easy to understand.\\nYour goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).\\nUse short sentences, simple words, and friendly tone. Avoid technical or complex expressions.\\nKeep all important factual details, but remove jargon.\\nReturn only the rewritten text without commentary.\",\n",
397
+ " \"role\": \"system\",\n",
398
+ " \"thinking\": null\n",
399
+ " },\n",
400
+ " {\n",
401
+ " \"content\": \"Please rewrite the following medical report to achieve a Fernández Huerta score of 77.2 (difficulty level: easy):\\n\\nPaciente masculino de 18 años con diagnóstico molecular confirmado de Neurofibromatosis tipo 1 \\n (deleción exones 5-47 del gen NF1), que presenta antecedentes de retraso del desarrollo psicomotor \\n global diagnosticado a los 3 años, trastorno específico del lenguaje de tipo expresivo que requirió \\n intervención fonoaudiológica, y TDAH en tratamiento con metilfenidato 20mg/día con buena respuesta. \\n Hallazgos oftalmológicos incluyen nódulos de Lisch bilaterales, astigmatismo miópico compuesto y \\n euriblefaron bilateral. Motivo de consulta actual: aparición de placa eritematosa de 3cm en muslo \\n izquierdo de 12 meses de evolución y múltiples nódulos subcutáneos móviles no dolorosos en región \\n supraciliar derecha, occipital y muñeca izquierda. Examen físico revela macrocefalia (PC 59cm, >p97), \\n 15 máculas café con leche >1.5cm, efélides axilares e inguinales bilaterales, y 3 máculas \\n rojo-azuladas deprimidas de 0.5-1cm en región lumbar y pectoral derecha. Estudios histopatológicos \\n confirman neurofibromas con inmunohistoquímica S100(+), SOX10(+). Ecografía de partes blandas \\n muestra lesiones hipoecoicas bien delimitadas compatibles con neurofibromas subcutáneos.\",\n",
402
+ " \"role\": \"user\",\n",
403
+ " \"thinking\": \"Estoy analizando este informe médico. Primero debo identificar:\\n1. Datos del paciente: paciente de 18 años\\n2. Diagnóstico principal: diagnóstico, trastorno\\n3. Síntomas y signos clínicos: dolor, nódulo, retraso\\n4. Pruebas realizadas: ecografía, examen\\n5. Tratamiento: tratamiento, metilfenidato\\n\\nAhora debo adaptar esta información al nivel de lectura solicitado: easy.\"\n",
404
+ " },\n",
405
+ " {\n",
406
+ " \"content\": \"Un joven de 18 años tenía una enfermedad llamada Neurofibromatosis tipo 1 desde que era bebé. Esta enfermedad produce manchas café con leche en la piel y pequeños bultos. Durante su infancia tuvo algunas dificultades para hablar y moverse bien, por lo que recibió terapias especiales. En la adolescencia le dieron medicamentos para mejorar su concentración. A los 18 años fue al dermatólogo porque le salió una nueva mancha en el muslo y algunos bultos en la piel. Le hicieron exámenes y confirmaron que eran parte de su enfermedad. Los médicos clasificaron los distintos tipos de manchas y bultos que tenía en la piel.\",\n",
407
+ " \"role\": \"assistant\",\n",
408
+ " \"thinking\": \"Necesito simplificar mucho este texto:\\n- Cambiar términos médicos complejos por palabras cotidianas\\n- Usar máximo 10-15 palabras por oración\\n- Explicar todo como si fuera para un niño\\n- Mantener solo la información esencial\\n- Hacer que suene amigable y no aterrador\"\n",
409
+ " }\n",
410
+ " ]\n",
411
+ "}\n",
412
+ "\n",
413
+ "📊 Dataset Statistics:\n",
414
+ "Total training examples: 5\n",
415
+ "Number of messages per example: 3\n",
416
+ "All examples have thinking: True\n",
417
+ "✅ All validation checks passed!"
418
+ ]
419
+ }
420
+ ],
421
+ "metadata": {
422
+ "kernelspec": {
423
+ "display_name": "unsloth_latest",
424
+ "language": "python",
425
+ "name": "python3"
426
+ },
427
+ "language_info": {
428
+ "codemirror_mode": {
429
+ "name": "ipython",
430
+ "version": 3
431
+ },
432
+ "file_extension": ".py",
433
+ "mimetype": "text/x-python",
434
+ "name": "python",
435
+ "nbconvert_exporter": "python",
436
+ "pygments_lexer": "ipython3",
437
+ "version": "3.11.13"
438
+ }
439
+ },
440
+ "nbformat": 4,
441
+ "nbformat_minor": 5
442
+ }
code/old/readability_controlv2.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
import time
import random


def initialize_and_touch(tensor):
    """Zero-fill the tensor so its GPU pages are actually committed, then sync."""
    tensor.zero_()
    torch.cuda.synchronize()


def dummy_compute(tensor):
    """Run a matmul on `tensor` to produce a visible GPU-utilization spike."""
    result = torch.matmul(tensor, tensor.t())
    torch.cuda.synchronize()
    return result


device = torch.device("cuda")
total_memory = torch.cuda.get_device_properties(device).total_memory
print(f"Total VRAM: {total_memory / (1024**3):.2f} GB")

allocated_tensors = []
chunk_size_bytes = 4 * 1024**3  # nominal 4 GiB per allocation attempt
element_size = torch.tensor([], dtype=torch.float32).element_size()
chunk_elements = chunk_size_bytes // element_size

# Make the chunk roughly square so the matmul in dummy_compute is well-formed.
side = int(chunk_elements ** 0.5)

allocated = 0
target = total_memory * 0.95  # leave ~5% headroom for the CUDA context

print("Allocating and initializing memory...")
while allocated < target:
    try:
        # Allocate a 2D tensor and touch it so the memory is really backed.
        chunk = torch.empty((side, side), dtype=torch.float32, device=device)
        initialize_and_touch(chunk)
        allocated_tensors.append(chunk)
        # BUG FIX: count the bytes actually allocated (side*side elements);
        # `side` is the floor of sqrt(chunk_elements), so each chunk is
        # slightly smaller than the nominal 4 GiB the old code added.
        allocated += chunk.numel() * element_size
        print(f"Allocated: {allocated / (1024**3):.2f} GB", end='\r')
    except RuntimeError as e:
        if 'out of memory' in str(e).lower():
            print(f"\nOut of memory after {allocated / (1024**3):.2f} GB")
            break
        else:
            raise

print(f"\nHolding {allocated / (1024**3):.2f} GB in {len(allocated_tensors)} chunks.")
print("Running dummy compute every 30 seconds to show GPU utilization...")

compute_interval = 30  # seconds between utilization spikes
last_compute = time.time()

while True:
    now = time.time()
    if now - last_compute >= compute_interval:
        if allocated_tensors:
            t = random.choice(allocated_tensors)
            try:
                # BUG FIX: use a distinct name so the module-level `side`
                # (allocation size) is not clobbered by this cap.
                sub = min(t.shape[0], 8000)
                _ = dummy_compute(t[:sub, :sub])
                print(f"[{time.strftime('%H:%M:%S')}] GPU compute spike (util ↑)")
            except Exception as e:
                print(f"Compute failed (expected if chunk too big): {e}")
        last_compute = now

    time.sleep(1)
+ time.sleep(1)
code/old/resonability_check_completeness_openai_V2.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
def return_prompts(reference_summary, generated_summary, subclaims_json, difficulty_level):
    """Build the evaluator prompt for judging subclaim inclusion/omission.

    The prompt asks a model to label each subclaim's inclusion (result=1) or
    omission (result=0) as reasonable / partially_reasonable / unreasonable
    given the target readability level, and to reply with structured JSON
    (parsed downstream by openai_return).

    Args:
        reference_summary: Reference (gold) summary text.
        generated_summary: Generated summary at the target readability level.
        subclaims_json: Serialized subclaims with their 0/1 support results.
        difficulty_level: "easy", "intermediate", or "hard".

    Returns:
        The fully rendered prompt string.
    """
    prompt=f'''
You are a **medical summarization quality evaluator**.
Your goal is to decide whether the inclusion or omission of each subclaim in the generated summary is *reasonable*, given the target readability level.

---

### **Input**

```
Readability Level: {difficulty_level}

Reference Summary:
{reference_summary}

Generated Summary:
{generated_summary}

Subclaims with Support Results:
{subclaims_json}
```

---

### **Task**

For each subclaim:

1. Read `result`:

* `1` = the subclaim is supported or clearly mentioned in the generated summary.
* `0` = the subclaim is missing or not supported.

2. Based on readability level and medical relevance, decide whether this inclusion/omission is **reasonable**, **partially reasonable**, or **unreasonable**.

3. Provide a short justification (1–2 sentences) explaining your reasoning.

---

### **Output Format**

Return structured JSON:

```json
{{
"readability_level": "<easy/intermediate/hard>",
"evaluations": [
{{
"subclaim_id": <id>,
"subclaim_text": "<text>",
"result": <0 or 1>,
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short explanation>"
}},
...
]
}}
```

---

### **Evaluation Guidelines**

| Readability Level | Reasonable Omission | Unreasonable Omission |
| ----------------- | ------------------------------------------------------------ | ------------------------------------------------- |
| **Easy** | Technical, anatomical, quantitative, or procedural details. | Key clinical findings, diagnoses, or outcomes. |
| **Intermediate** | Minor imaging details or measurements. | Any main diagnostic finding or cause–effect link. |
| **Hard** | Very few omissions acceptable; mostly stylistic compression. | Any missing clinical or diagnostic information. |

'''
    return prompt
73
+
74
from openai import OpenAI

# Load the OpenAI API key from the local credentials file.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.load(file)

openai_api_key = api_keys.get("openai")

client = OpenAI(api_key=openai_api_key)


def openai_return(prompt, model="gpt-5"):
    """Send `prompt` to the chat-completions API and parse the JSON reply.

    Args:
        prompt: Fully rendered evaluation prompt.
        model: Chat model name; parameterized (default "gpt-5") so callers can
            switch models without editing this function — existing call sites
            are unchanged.

    Returns:
        The parsed JSON object from the model's reply.

    Raises:
        json.JSONDecodeError: If the fence-stripped reply is not valid JSON.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    # Strip the Markdown code fences the model often wraps around JSON output.
    cleaned_response = response.choices[0].message.content.strip().replace("```json", "").replace("```", "")
    return json.loads(cleaned_response)
93
+
94
import json
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"

with open(file_path, 'r') as f:
    synthetic_data = json.load(f)

file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"

with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)

# Resume support: reload previous results (if any) and remember which
# (id, difficulty) pairs have already been evaluated.
res = []
# BUG FIX: define the resume set unconditionally — it was previously only
# assigned inside the os.path.exists() branch, so a fresh run (no save file)
# raised NameError at the first membership test below.
exist_check_ids = set()
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/syn_data_resonability_check_20_gpt5.json"
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    exist_check_ids = {(item['id'], item['difficulty_level']) for item in res}
    print(f"Resuming from {len(res)} entries")

import tqdm
for ind in tqdm.tqdm(range(0, 20)):
    print(f"Processing index: {ind}")
    for version in ["easy", "intermediate", "hard"]:
        if (synthetic_data[ind]['id'], version) in exist_check_ids:
            print(f"Skipping {synthetic_data[ind]['id']} {version}")
            continue
        ref_summary = f"{synthetic_data[ind]['ref_summary']['text']}"
        generated_summary = f"{synthetic_data[ind]['readability_versions'][version]['text']}"
        subclaims_results = f"{qwen3_32B_results[ind]['completeness']['results']}"
        try:
            prompt = return_prompts(ref_summary, generated_summary, subclaims_results, version)
            res.append({
                "id": synthetic_data[ind]['id'],
                "difficulty_level": version,
                "reasonableness": openai_return(prompt)
            })
            # Checkpoint every second completed item so a crash loses little work.
            if len(res) % 2 == 0:
                # BUG FIX: total is 20 indices * 3 versions = 60, not 300.
                print(f"Completed {len(res)} out of 60")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            print(f"Error at {ind} {version}: {e}")

# Final write so the last (possibly odd-numbered) entries are persisted.
with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)
code/old/resonability_check_completeness_openai_V3.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
def return_prompts(reference_summary, generated_summary, subclaims_json, difficulty_level):
    """Build the evaluator prompt for judging subclaim inclusion/omission.

    The prompt asks a model to label each subclaim's inclusion (result=1) or
    omission (result=0) as reasonable / partially_reasonable / unreasonable
    given the target readability level, and to reply with structured JSON
    (parsed downstream by openai_return).

    Args:
        reference_summary: Reference (gold) summary text.
        generated_summary: Generated summary at the target readability level.
        subclaims_json: Serialized subclaims with their 0/1 support results.
        difficulty_level: "easy", "intermediate", or "hard".

    Returns:
        The fully rendered prompt string.
    """
    prompt=f'''
You are a **medical summarization quality evaluator**.
Your goal is to decide whether the inclusion or omission of each subclaim in the generated summary is *reasonable*, given the target readability level.

---

### **Input**

```
Readability Level: {difficulty_level}

Reference Summary:
{reference_summary}

Generated Summary:
{generated_summary}

Subclaims with Support Results:
{subclaims_json}
```

---

### **Task**

For each subclaim:

1. Read `result`:

* `1` = the subclaim is supported or clearly mentioned in the generated summary.
* `0` = the subclaim is missing or not supported.

2. Based on readability level and medical relevance, decide whether this inclusion/omission is **reasonable**, **partially reasonable**, or **unreasonable**.

3. Provide a short justification (1–2 sentences) explaining your reasoning.

---

### **Output Format**

Return structured JSON:

```json
{{
"readability_level": "<easy/intermediate/hard>",
"evaluations": [
{{
"subclaim_id": <id>,
"subclaim_text": "<text>",
"result": <0 or 1>,
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short explanation>"
}},
...
]
}}
```

---

### **Evaluation Guidelines**

| Readability Level | Reasonable Omission | Unreasonable Omission |
| ----------------- | ------------------------------------------------------------ | ------------------------------------------------- |
| **Easy** | Technical, anatomical, quantitative, or procedural details. | Key clinical findings, diagnoses, or outcomes. |
| **Intermediate** | Minor imaging details or measurements. | Any main diagnostic finding or cause–effect link. |
| **Hard** | Very few omissions acceptable; mostly stylistic compression. | Any missing clinical or diagnostic information. |

'''
    return prompt
73
+
74
from openai import OpenAI

# Load the OpenAI API key from the local credentials file.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.load(file)

openai_api_key = api_keys.get("openai")

client = OpenAI(api_key=openai_api_key)


def openai_return(prompt, model="gpt-5"):
    """Send `prompt` to the chat-completions API and parse the JSON reply.

    Args:
        prompt: Fully rendered evaluation prompt.
        model: Chat model name; parameterized (default "gpt-5") so callers can
            switch models without editing this function — existing call sites
            are unchanged.

    Returns:
        The parsed JSON object from the model's reply.

    Raises:
        json.JSONDecodeError: If the fence-stripped reply is not valid JSON.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    # Strip the Markdown code fences the model often wraps around JSON output.
    cleaned_response = response.choices[0].message.content.strip().replace("```json", "").replace("```", "")
    return json.loads(cleaned_response)
93
+
94
import json
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"

with open(file_path, 'r') as f:
    synthetic_data = json.load(f)

file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"

with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)

# Resume support: reload previous results (if any) and remember which
# (id, difficulty) pairs have already been evaluated.
res = []
# BUG FIX: define the resume set unconditionally — it was previously only
# assigned inside the os.path.exists() branch, so a fresh run (no save file)
# raised NameError at the first membership test below.
exist_check_ids = set()
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/syn_data_resonability_check_20_gpt5.json"
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    exist_check_ids = {(item['id'], item['difficulty_level']) for item in res}
    print(f"Resuming from {len(res)} entries")

import tqdm
for ind in tqdm.tqdm(range(0, 20)):
    print(f"Processing index: {ind}")
    for version in ["easy", "intermediate", "hard"]:
        if (synthetic_data[ind]['id'], version) in exist_check_ids:
            print(f"Skipping {synthetic_data[ind]['id']} {version}")
            continue
        ref_summary = f"{synthetic_data[ind]['ref_summary']['text']}"
        generated_summary = f"{synthetic_data[ind]['readability_versions'][version]['text']}"
        subclaims_results = f"{qwen3_32B_results[ind]['completeness']['results']}"
        try:
            prompt = return_prompts(ref_summary, generated_summary, subclaims_results, version)
            res.append({
                "id": synthetic_data[ind]['id'],
                "difficulty_level": version,
                "reasonableness": openai_return(prompt)
            })
            # Checkpoint every second completed item so a crash loses little work.
            if len(res) % 2 == 0:
                # BUG FIX: total is 20 indices * 3 versions = 60, not 300.
                print(f"Completed {len(res)} out of 60")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            print(f"Error at {ind} {version}: {e}")

# Final write so the last (possibly odd-numbered) entries are persisted.
with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)
code/old/synthetic_data_generationV3.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ from openai import OpenAI
5
+ import tqdm
6
+
7
+
8
+ client = OpenAI(api_key=json.load(open('/home/mshahidul/api.json', 'r'))['openai_api_key'])
9
+
10
+
11
+ # MODIFICATION: Create a dictionary to hold prompts for multiple languages.
12
+ ALL_PROMPTS = {
13
+ "en": {
14
+ "B1": """You are a summarization assistant. Your single most important goal is to rewrite medical text for a first-grade reading level (ages 5-7, FKGL 1.0-4.0). Simplicity is more important than detail.
15
+
16
+ Core Mandate:
17
+ - TARGET AUDIENCE: A 6-year-old child.
18
+ - PRIMARY GOAL: Extreme simplicity. If you must choose between accuracy of detail and simplicity, ALWAYS choose simplicity.
19
+
20
+ Strict Rules You Must Follow:
21
+ - SENTENCE LENGTH: Keep almost all sentences under 10 words. Use very short, simple sentences.
22
+ - VOCABULARY: Use only very common, everyday words that a first-grader would know. Avoid any medical or scientific terms. Instead of 'femur', say 'thigh bone'. Instead of 'benign', say 'not harmful'.
23
+ - TONE: Be very gentle, calm, and reassuring. Like a kind doctor explaining something to a small child.
24
+ - STRUCTURE: Use short paragraphs, often just one or two sentences long.
25
+ - FOCUS: Only mention the most important one or two points from the original text. Omit all other details.
26
+
27
+ - Never use emojis.
28
+ - Do not explain pronunciation.
29
+ - DO NOT use any medical jargon.
30
+ """,
31
+ "B2": """You are a summarization assistant trained to rewrite medical summaries for a middle school reading level (ages 11–14, FKGL 6.0–9.0). Your goal is clarity for a teenager with a basic understanding of biology.
32
+
33
+ Core Mandate:
34
+ - TARGET AUDIENCE: A 14-year-old in a 9th-grade biology class.
35
+ - PRIMARY GOAL: Clarity and straightforward explanation.
36
+
37
+ Strict Rules You Must Follow:
38
+ - SENTENCE LENGTH: Vary sentence length, but aim for an average of 12-18 words. Avoid long, complex sentences.
39
+ - VOCABULARY: You can use basic medical terms (e.g., 'biopsy', 'cells', 'tumor'), but you MUST explain them in simple terms immediately. For example: "A biopsy, which is when a small piece of tissue is taken for testing...".
40
+ - TONE: Be empathetic but direct. Use an educational and informative tone, like a science teacher.
41
+ - STRUCTURE: Organize the summary into logical paragraphs. You can use simple headings if it helps clarity (e.g., "What They Found," "What It Means").
42
+ - FOCUS: Summarize the main findings and their implications. Omit minor or highly technical details.
43
+
44
+ - Never use emojis.
45
+ - Do not explain pronunciation.
46
+ """,
47
+ "B3": """You are a summarization assistant trained to rewrite medical summaries for an educated, non-medical adult (ages 17+, FKGL 12.0+). Your goal is to be precise, comprehensive, and clear for a college-level reader.
48
+
49
+ Core Mandate:
50
+ - TARGET AUDIENCE: A curious college student or adult with no medical training.
51
+ - PRIMARY GOAL: Precision and structured clarity.
52
+
53
+ Strict Rules You Must Follow:
54
+ - SENTENCE LENGTH: Use clear, well-constructed sentences. Complex sentences are acceptable if they enhance clarity and precision.
55
+ - VOCABULARY: Use correct medical terminology. You can assume the reader can understand terms from context or look them up, but for very specialized terms, provide a brief parenthetical explanation. For example: "...showed evidence of hyperplasia (an increase in the number of cells)."
56
+ - TONE: Maintain a professional, empathetic, and respectful tone. Be authoritative but not clinical or cold.
57
+ - STRUCTURE: Provide a detailed and structured summary. Use headings to organize information, such as "Background," "Key Findings," "Clinical Interpretation," and "Next Steps."
58
+ - FOCUS: Be comprehensive and faithful to the source summary. Include important details, test results, and differential diagnoses mentioned in the source.
59
+
60
+ - Never use emojis.
61
+ - Do not explain pronunciation.
62
+ """
63
+ },
64
+ "es": {
65
+ "B1": """Eres un asistente de resumen. Tu único y más importante objetivo es reescribir texto médico para un nivel de lectura de primer grado (edades 5-7). La simplicidad es más importante que el detalle.
66
+
67
+ Mandato Principal:
68
+ - PÚBLICO OBJETIVO: Un niño de 6 años.
69
+ - OBJETIVO PRIMARIO: Simplicidad extrema. Si debes elegir entre la precisión del detalle y la simplicidad, SIEMPRE elige la simplicidad.
70
+
71
+ Reglas Estrictas que Debes Seguir:
72
+ - IDIOMA: El resumen DEBE estar escrito en español.
73
+ - LONGITUD DE LA ORACIÓN: Casi todas las oraciones deben tener menos de 10 palabras. Usa frases muy cortas y simples.
74
+ - VOCABULARIO: Usa solo palabras cotidianas y muy comunes que un niño de primer grado conocería. Evita cualquier término médico o científico. En lugar de 'fémur', di 'hueso del muslo'. En lugar de 'benigno', di 'que no es dañino'.
75
+ - TONO: Sé muy gentil, calmado y tranquilizador. Como un doctor amable explicándole algo a un niño pequeño.
76
+ - ESTRUCTURA: Usa párrafos cortos, a menudo de solo una o dos oraciones.
77
+ - ENFOQUE: Menciona solo el punto más importante o los dos puntos más importantes del texto original. Omite todos los demás detalles.
78
+
79
+ - Nunca uses emojis.
80
+ - No expliques la pronunciación.
81
+ - NO uses jerga médica.
82
+ """,
83
+ "B2": """Eres un asistente de resumen entrenado para reescribir resúmenes médicos para un nivel de lectura de secundaria (edades 11–14). Tu objetivo es la claridad para un adolescente con conocimientos básicos de biología.
84
+
85
+ Mandato Principal:
86
+ - PÚBLICO OBJETIVO: Un estudiante de 14 años en una clase de biología de secundaria.
87
+ - OBJETIVO PRIMARIO: Claridad y explicación directa.
88
+
89
+ Reglas Estrictas que Debes Seguir:
90
+ - IDIOMA: El resumen DEBE estar escrito en español.
91
+ - LONGITUD DE LA ORACIÓN: Varía la longitud de las oraciones, pero busca un promedio de 12-18 palabras. Evita las oraciones largas y complejas.
92
+ - VOCABULARIO: Puedes usar términos médicos básicos (ej., 'biopsia', 'células', 'tumor'), pero DEBES explicarlos en términos sencillos inmediatamente. Por ejemplo: "Una biopsia, que es cuando se toma un pequeño trozo de tejido para analizarlo...".
93
+ - TONO: Sé empático pero directo. Usa un tono educativo e informativo, como un profesor de ciencias.
94
+ - ESTRUCTURA: Organiza el resumen en párrafos lógicos. Puedes usar encabezados simples si ayuda a la claridad (ej., "Lo que Encontraron," "Qué Significa").
95
+ - ENFOQUE: Resume los hallazgos principales y sus implicaciones. Omite detalles menores o muy técnicos.
96
+
97
+ - Nunca uses emojis.
98
+ - No expliques la pronunciación.
99
+ """,
100
+ "B3": """Eres un asistente de resumen entrenado para reescribir resúmenes médicos para un adulto educado no médico (edades 17+). Tu objetivo es ser preciso, completo y claro para un lector de nivel universitario.
101
+
102
+ Mandato Principal:
103
+ - PÚBLICO OBJETIVO: Un estudiante universitario o un adulto curioso sin formación médica.
104
+ - OBJETIVO PRIMARIO: Precisión y claridad estructurada.
105
+
106
+ Reglas Estrictas que Debes Seguir:
107
+ - IDIOMA: El resumen DEBE estar escrito en español.
108
+ - LONGITUD DE LA ORACIÓN: Usa oraciones claras y bien construidas. Las oraciones complejas son aceptables si mejoran la claridad y la precisión.
109
+ - VOCABULARIO: Usa la terminología médica correcta. Puedes asumir que el lector puede entender los términos por el contexto o buscarlos, pero para términos muy especializados, proporciona una breve explicación entre paréntesis. Por ejemplo: "...mostró evidencia de hiperplasia (un aumento en el número de células)."
110
+ - TONO: Mantén un tono profesional, empático y respetuoso. Sé autoritario pero no clínico o frío.
111
+ - ESTRUCTURA: Proporciona un resumen detallado y estructurado. Usa encabezados para organizar la información, como "Contexto," "Hallazgos Clave," "Interpretación Clínica," y "Próximos Pasos."
112
+ - ENFOQUE: Sé completo y fiel al resumen original. Incluye detalles importantes, resultados de pruebas y diagnósticos diferenciales mencionados en la fuente.
113
+
114
+ - Nunca uses emojis.
115
+ - No expliques la pronunciación.
116
+ """
117
+ },
118
+ "fr": {
119
+ "B1": """Vous êtes un assistant de résumé. Votre unique et plus important objectif est de réécrire un texte médical pour un niveau de lecture de cours préparatoire (âges 5-7). La simplicité est plus importante que le détail.
120
+
121
+ Mandat Principal :
122
+ - PUBLIC CIBLE : Un enfant de 6 ans.
123
+ - OBJECTIF PRINCIPAL : Simplicité extrême. Si vous devez choisir entre la précision des détails et la simplicité, choisissez TOUJOURS la simplicité.
124
+
125
+ Règles Strictes à Suivre Impérativement :
126
+ - LANGUE : Le résumé DOIT être rédigé en français.
127
+ - LONGUEUR DES PHRASES : Presque toutes les phrases doivent faire moins de 10 mots. Utilisez des phrases très courtes et simples.
128
+ - VOCABULAIRE : Utilisez uniquement des mots très courants et quotidiens qu'un enfant de cet âge connaîtrait. Évitez tout terme médical ou scientifique. Au lieu de 'fémur', dites 'l'os de la cuisse'. Au lieu de 'bénin', dites 'pas dangereux'.
129
+ - TON : Soyez très doux, calme et rassurant. Comme un médecin bienveillant qui explique quelque chose à un jeune enfant.
130
+ - STRUCTURE : Utilisez des paragraphes courts, souvent composés d'une ou deux phrases seulement.
131
+ - PRIORITÉ : Mentionnez uniquement le ou les deux points les plus importants du texte original. Omettez tous les autres détails.
132
+
133
+ - N'utilisez jamais d'emojis.
134
+ - N'expliquez pas la prononciation.
135
+ - N'utilisez AUCUN jargon médical.
136
+ """,
137
+ "B2": """Vous êtes un assistant de résumé entraîné à réécrire des résumés médicaux pour un niveau de lecture de collège (âges 11–14). Votre objectif est la clarté pour un adolescent ayant une compréhension de base de la biologie.
138
+
139
+ Mandat Principal :
140
+ - PUBLIC CIBLE : Un adolescent de 14 ans en classe de biologie au collège.
141
+ - OBJECTIF PRINCIPAL : Clarté et explication directe.
142
+
143
+ Règles Strictes à Suivre Impérativement :
144
+ - LANGUE : Le résumé DOIT être rédigé en français.
145
+ - LONGUEUR DES PHRASES : Variez la longueur des phrases, mais visez une moyenne de 12-18 mots. Évitez les phrases longues et complexes.
146
+ - VOCABULAIRE : Vous pouvez utiliser des termes médicaux de base (ex: 'biopsie', 'cellules', 'tumeur'), mais vous DEVEZ les expliquer en termes simples immédiatement. Par exemple : "Une biopsie, c'est-à-dire quand on prélève un petit morceau de tissu pour l'analyser...".
147
+ - TON : Soyez empathique mais direct. Adoptez un ton pédagogique et informatif, comme un professeur de sciences.
148
+ - STRUCTURE : Organisez le résumé en paragraphes logiques. Vous pouvez utiliser des titres simples si cela améliore la clarté (ex: "Ce qu'ils ont trouvé", "Ce que cela signifie").
149
+ - PRIORITÉ : Résumez les principales observations et leurs implications. Omettez les détails mineurs ou très techniques.
150
+
151
+ - N'utilisez jamais d'emojis.
152
+ - N'expliquez pas la prononciation.
153
+ """,
154
+ "B3": """Vous êtes un assistant de résumé entraîné à réécrire des résumés médicaux pour un adulte éduqué non-médecin (âges 17+). Votre objectif est d'être précis, complet et clair pour un lecteur de niveau universitaire.
155
+
156
+ Mandat Principal :
157
+ - PUBLIC CIBLE : Un étudiant ou un adulte curieux sans formation médicale.
158
+ - OBJECTIF PRINCIPAL : Précision et clarté structurée.
159
+
160
+ Règles Strictes à Suivre Impérativement :
161
+ - LANGUE : Le résumé DOIT être rédigé en français.
162
+ - LONGUEUR DES PHRASES : Utilisez des phrases claires et bien construites. Les phrases complexes sont acceptables si elles améliorent la clarté et la précision.
163
+ - VOCABULAIRE : Utilisez la terminologie médicale correcte. Vous pouvez supposer que le lecteur peut comprendre les termes par le contexte ou les rechercher, mais pour les termes très spécialisés, fournissez une brève explication entre parenthèses. Par exemple : "...montrait des signes d'hyperplasie (une augmentation du nombre de cellules)."
164
+ - TON : Maintenez un ton professionnel, empathique et respectueux. Soyez directif mais ni clinique ni froid.
165
+ - STRUCTURE : Fournissez un résumé détaillé et structuré. Utilisez des titres pour organiser l'information, tels que "Contexte", "Principales Observations", "Interprétation Clinique" et "Prochaines Étapes".
166
+ - PRIORITÉ : Soyez complet et fidèle au résumé source. Incluez les détails importants, les résultats des tests et les diagnostics différentiels mentionnés dans la source.
167
+
168
+ - N'utilisez jamais d'emojis.
169
+ - N'expliquez pas la prononciation.
170
+ """
171
+ },
172
+
173
+ "pt": {
174
+ "B1": """Você é um assistente de resumo. O seu único e mais importante objetivo é reescrever textos médicos para um nível de leitura da primeira série (idades 5-7). A simplicidade é mais importante que os detalhes.
175
+
176
+ Mandato Principal:
177
+ - PÚBLICO-ALVO: Uma criança de 6 anos.
178
+ - OBJETIVO PRINCIPAL: Simplicidade extrema. Se tiver que escolher entre a precisão dos detalhes e a simplicidade, ESCOLHA SEMPRE a simplicidade.
179
+
180
+ Regras Rígidas que Você Deve Seguir:
181
+ - IDIOMA: O resumo DEVE ser escrito em português.
182
+ - COMPRIMENTO DAS FRASES: Quase todas as frases devem ter menos de 10 palavras. Use frases muito curtas e simples.
183
+ - VOCABULÁRIO: Use apenas palavras quotidianas e muito comuns que uma criança da primeira série conheceria. Evite qualquer termo médico ou científico. Em vez de 'fêmur', diga 'o osso da coxa'. Em vez de 'benigno', diga 'que não faz mal'.
184
+ - TOM: Seja muito gentil, calmo e tranquilizador. Como um médico amável a explicar algo a uma criança pequena.
185
+ - ESTRUTURA: Use parágrafos curtos, muitas vezes com apenas uma ou duas frases.
186
+ - FOCO: Mencione apenas um ou dois dos pontos mais importantes do texto original. Omita todos os outros detalhes.
187
+
188
+ - Nunca use emojis.
189
+ - Não explique a pronúncia.
190
+ - NÃO use NENHUM jargão médico.
191
+ """,
192
+ "B2": """Você é um assistente de resumo treinado para reescrever resumos médicos para um nível de leitura do ensino fundamental II (idades 11–14). O seu objetivo é a clareza para um adolescente com conhecimentos básicos de biologia.
193
+
194
+ Mandato Principal:
195
+ - PÚBLICO-ALVO: Um adolescente de 14 anos numa aula de biologia.
196
+ - OBJETIVO PRINCIPAL: Clareza e explicação direta.
197
+
198
+ Regras Rígidas que Você Deve Seguir:
199
+ - IDIOMA: O resumo DEVE ser escrito em português.
200
+ - COMPRIMENTO DAS FRASES: Varie o comprimento das frases, mas procure uma média de 12 a 18 palavras. Evite frases longas e complexas.
201
+ - VOCABULÁRIO: Pode usar termos médicos básicos (ex: 'biópsia', 'células', 'tumor'), mas você DEVE explicá-los em termos simples imediatamente. Por exemplo: "Uma biópsia, que é quando um pequeno pedaço de tecido é retirado para ser analisado...".
202
+ - TOM: Seja empático, mas direto. Use um tom educativo e informativo, como um professor de ciências.
203
+ - ESTRUTURA: Organize o resumo em parágrafos lógicos. Pode usar títulos simples se isso ajudar na clareza (ex: "O que eles encontraram", "O que isso significa").
204
+ - FOCO: Resuma os principais achados e as suas implicações. Omita detalhes menores ou muito técnicos.
205
+
206
+ - Nunca use emojis.
207
+ - Não explique a pronúncia.
208
+ """,
209
+ "B3": """Você é um assistente de resumo treinado para reescrever resumos médicos para um adulto instruído, mas sem formação médica (idades 17+). O seu objetivo é ser preciso, abrangente e claro para um leitor de nível universitário.
210
+
211
+ Mandato Principal:
212
+ - PÚBLICO-ALVO: Um estudante universitário ou adulto curioso sem formação médica.
213
+ - OBJETIVO PRINCIPAL: Precisão e clareza estruturada.
214
+
215
+ Regras Rígidas que Você Deve Seguir:
216
+ - IDIOMA: O resumo DEVE ser escrito em português.
217
+ - COMPRIMENTO DAS FRASES: Use frases claras e bem construídas. Frases complexas são aceitáveis se melhorarem a clareza e a precisão.
218
+ - VOCABULÁRIO: Use a terminologia médica correta. Pode assumir que o leitor consegue entender os termos pelo contexto ou pesquisá-los, mas para termos muito especializados, forneça uma breve explicação entre parênteses. Por exemplo: "...mostrou evidência de hiperplasia (um aumento no número de células)."
219
+ - TOM: Mantenha um tom profissional, empático e respeitoso. Seja confiante, mas não clínico ou frio.
220
+ - ESTRUTURA: Forneça um resumo detalhado e estruturado. Use títulos para organizar a informação, como "Contexto", "Principais Achados", "Interpretação Clínica" e "Próximos Passos".
221
+ - FOCO: Seja abrangente e fiel ao resumo original. Inclua detalhes importantes, resultados de testes e diagnósticos diferenciais mencionados na fonte.
222
+
223
+ - Nunca use emojis.
224
+ - Não explique a pronúncia.
225
+ """
226
+ }
227
+
228
+ }
229
+ USER_PROMPT_TEMPLATES = {
230
+ "en": """Please rewrite the following expert summary for the specified target audience. Use the full article for context if needed.
231
+ **Full Article Context:**
232
+ {article}
233
+ **Expert Summary to Rewrite:**
234
+ {gold_summary}
235
+ """,
236
+ "es": """Por favor, reescribe el siguiente resumen de experto para el público objetivo especificado. Usa el artículo completo como contexto si es necesario.
237
+ **Contexto del Artículo Completo:**
238
+ {article}
239
+ **Resumen de Experto a Reescribir:**
240
+ {gold_summary}
241
+ """,
242
+ "fr": """Veuillez réécrire le résumé d'expert suivant pour le public cible spécifié. Utilisez l'article complet comme contexte si nécessaire.
243
+ **Contexte de l'Article Complet :**
244
+ {article}
245
+ **Résumé d'Expert à Réécrire :**
246
+ {gold_summary}
247
+ """,
248
+ "pt": """Por favor, reescreva o seguinte resumo de especialista para o público-alvo especificado. Use o artigo completo como contexto, se necessário.
249
+ **Contexto do Artigo Completo:**
250
+ {article}
251
+ **Resumo do Especialista a Ser Reescrito:**
252
+ {gold_summary}
253
+ """
254
+ }
255
+
256
def generate_synthetic_summary(article, gold_summary, band, lang):
    """Produce one readability-banded rewrite of a gold summary via the OpenAI API.

    Args:
        article: Full source article, passed to the model as context.
        gold_summary: Expert summary to be rewritten.
        band: Readability band key ("B1", "B2", or "B3").
        lang: Language key into ALL_PROMPTS / USER_PROMPT_TEMPLATES.

    Returns:
        The rewritten summary text, or None if all three API attempts fail.

    Raises:
        ValueError: if no prompts are configured for ``lang``.
    """
    band_prompts = ALL_PROMPTS.get(lang)
    template = USER_PROMPT_TEMPLATES.get(lang)
    if not band_prompts or not template:
        raise ValueError(f"No prompts available for language: {lang}")

    sys_msg = band_prompts[band]
    usr_msg = template.format(article=article, gold_summary=gold_summary)

    max_tries = 3
    for attempt in range(max_tries):
        try:
            completion = client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=[
                    {"role": "system", "content": sys_msg},
                    {"role": "user", "content": usr_msg}
                ],
                temperature=0.3
            )
            return completion.choices[0].message.content.strip()
        except Exception as e:
            print(f"API call failed on attempt {attempt + 1} for band {band}: {e}")
            if attempt == max_tries - 1:
                print(f"Failed to generate summary for band {band} after 3 attempts.")
                return None
            # Brief pause before retrying the API call.
            time.sleep(5)
285
def build_synthetic_dataset(input_path, output_path, lang, max_samples=None):
    """Generate a synthetic dataset from a JSON file for a specific language.

    Reads a JSON list of records (each with "fulltext" and "summary" keys),
    produces B1/B2/B3 readability rewrites of every summary via
    generate_synthetic_summary, and writes accumulated results to output_path.
    If output_path already exists, prior results are loaded and their articles
    skipped, so an interrupted run can be resumed.

    Args:
        input_path: Path to the source JSON list.
        output_path: Path of the (possibly pre-existing) result JSON.
        lang: Language key for ALL_PROMPTS / USER_PROMPT_TEMPLATES.
        max_samples: Stop once the TOTAL record count (including resumed
            records) reaches this number; None disables the limit.
    """
    results = []
    processed_articles = set()
    # Resume support: reload prior output and remember which articles are done.
    if os.path.exists(output_path):
        with open(output_path, 'r', encoding='utf-8') as f:
            try:
                results = json.load(f)
                # NOTE: done-articles are keyed by their full text, not by an id.
                processed_articles = {item['article'] for item in results}
                print(f"Loaded {len(results)} existing records from {output_path}.")
            except json.JSONDecodeError:
                # Corrupt/partial checkpoint: discard it and regenerate everything.
                print(f"Warning: Could not decode JSON from {output_path}. Starting fresh.")
                results = []

    with open(input_path, "r", encoding='utf-8') as f:
        data = json.load(f)

    items_to_process = [item for item in data if item["fulltext"] not in processed_articles]
    print(f"Found {len(items_to_process)} new articles to process.")

    for item in tqdm.tqdm(items_to_process):
        if max_samples and len(results) >= max_samples:
            print(f"Reached max_samples limit of {max_samples}.")
            break

        article, gold = item["fulltext"], item["summary"]

        # Generate one rewrite per readability band; drop the article entirely
        # if any band fails, so every stored record has all three bands.
        synthetic_summaries = {}
        all_bands_successful = True
        for band in ["B1", "B2", "B3"]:
            synthetic = generate_synthetic_summary(article, gold, band, lang=lang)
            if synthetic:
                synthetic_summaries[band] = synthetic
            else:
                all_bands_successful = False
                break

        if all_bands_successful:
            results.append({
                "article": article,
                "gold_summary": gold,
                "synthetic_summary": synthetic_summaries
            })

        # Checkpoint every 5th record once new work has actually been added.
        if len(results) % 5 == 0 and len(results) > len(processed_articles):
            print(f"Processed {len(results)} total samples, saving progress...")
            with open(output_path, "w", encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=4)

    print("Generation complete. Saving final dataset...")
    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print(f"Dataset saved to {output_path}")
338
+
339
# --- Example Usage ---
# To run for another language, set lang accordingly (data file must exist).
lang = "pt"
path = f"/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_{lang}.json"

# BUG FIX: the old version-bump logic incremented only once and parsed just
# the LAST character of the filename, so two existing versions collided
# (V2 was silently reused) and "V10" was read back as version 0. Probe
# successive version numbers until an unused output path is found.
version = 1
output_file = f"/home/mshahidul/readctrl/generating_data/{lang}_syntheticV{version}.json"
while os.path.exists(output_file):
    version += 1
    output_file = f"/home/mshahidul/readctrl/generating_data/{lang}_syntheticV{version}.json"

build_synthetic_dataset(path, output_file, lang=lang, max_samples=100)
code/old/sz_es.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pyphen
3
+
4
+ # --- Basic Spanish text stats ---
5
+ _dic = pyphen.Pyphen(lang='es_ES')
6
+
7
+ _word_re = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
8
+
9
def _tokenize_words(text):
    """Return all word tokens: maximal runs of ASCII letters plus accented Spanish letters."""
    return re.findall(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", text, re.UNICODE)
11
+
12
+ def _count_sentences(text):
13
+ # Split on ., !, ?, and Spanish ¡¿ — keep it simple
14
+ parts = re.split(r"[.!?¡¿]+", text)
15
+ return max(1, sum(1 for p in parts if p.strip()))
16
+
17
def _count_syllables_es(word):
    """Approximate the syllable count of a Spanish word via pyphen hyphenation points.

    BUG FIX: pyphen's Pyphen object has no ``hyphenate`` method (its API is
    ``positions`` / ``inserted`` / ``iterate`` / ``wrap``), so the original call
    raised AttributeError at runtime. ``positions(word)`` returns the list of
    legal break points; a word with k break points has k + 1 syllables.
    Words pyphen cannot split count as a single syllable.
    """
    breaks = _dic.positions(word)
    return (len(breaks) + 1) if breaks else 1
20
+
21
def _text_stats_es(text):
    """Return (word_count, sentence_count, syllable_count, long_word_count) for *text*.

    long_word_count follows the LIX convention: words longer than 6 characters.
    """
    tokens = _tokenize_words(text)
    n_words = len(tokens)
    n_sents = _count_sentences(text)
    n_syllables = 0
    n_long = 0
    for tok in tokens:
        n_syllables += _count_syllables_es(tok)
        if len(tok) > 6:
            n_long += 1
    return n_words, n_sents, n_syllables, n_long
28
+
29
+ # --- Szigriszt–Pazos (INFLESZ) ---
30
def szigriszt_pazos(text):
    """Szigriszt-Pazos (INFLESZ) reading-ease score for Spanish; higher = easier.

    Returns None for degenerate input (no words or no sentences).
    """
    n_words, n_sents, n_syll, _ = _text_stats_es(text)
    if not n_words or not n_sents:
        return None
    return 206.835 - 62.3 * (n_syll / n_words) - (n_words / n_sents)
36
+
37
+ # --- LIX (language-agnostic) ---
38
def lix(text):
    """LIX readability index (language-agnostic); higher = harder to read.

    Returns None for degenerate input (no words or no sentences).
    """
    n_words, n_sents, _, n_long = _text_stats_es(text)
    if not n_words or not n_sents:
        return None
    return (n_words / n_sents) + (100.0 * n_long / n_words)
43
+
44
+ # Example bands (tune to your corpus)
45
+ SZ_BANDS = {
46
+ 'B1': (65, 100), # easy to very easy
47
+ 'B2': (55, 65), # normal
48
+ 'B3': (40, 55), # somewhat hard
49
+ }
50
+
51
+ LIX_BANDS = {
52
+ 'B1': (20, 35), # easier
53
+ 'B2': (35, 45), # mid
54
+ 'B3': (45, 60), # harder
55
+ }
56
+
57
def in_band(score, band, bands, delta=0.0):
    """True if *score* lies within bands[band], widened by *delta* on each side.

    A None score (degenerate text) is never in any band.
    """
    if score is None:
        return False
    low, high = bands[band]
    return low - delta <= score <= high + delta
62
+
63
+ # Example usage
64
+ text = "Las vacunas salvan millones de vidas cada año. Son seguras y eficaces."
65
+ sz = szigriszt_pazos(text)
66
+ lx = lix(text)
67
+ # print("Szigriszt:", sz, "B1?", in_band(sz, 'B1', SZ_BANDS, delta=2))
68
+ # print("LIX:", lx, "B1?", in_band(lx, 'B1', LIX_BANDS, delta=2))
code/rc.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Occupy (most of) one GPU's VRAM and hold it until the process is killed.
import os
import json
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--g", type=str, default="2", help="GPU ID")
args = parser.parse_args()
# Must be set BEFORE torch initializes CUDA so the chosen physical GPU
# becomes visible device 0 inside this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.g)

import torch
import time

# Set the specific GPU device (change the index if it's not GPU 0; check with nvidia-smi)
# torch.cuda.set_device(0)

# Total memory in bytes of the (now only visible) device 0.
total_memory = torch.cuda.get_device_properties(0).total_memory

# Keep references to every chunk so the allocations are never garbage-collected.
allocated_tensors = []

# Chunk size: Allocate in 4GB chunks to avoid fragmentation issues (adjust if needed)
chunk_size_bytes = 4 * 1024**3  # 4 GiB
chunk_elements = chunk_size_bytes // torch.tensor([], dtype=torch.float32).element_size()

try:
    allocated = 0
    # Allocate up to 85% of total memory to leave some headroom.
    while allocated < total_memory * 0.85:
        chunk = torch.empty(chunk_elements, dtype=torch.float32, device='cuda')
        allocated_tensors.append(chunk)
        allocated += chunk_size_bytes
        # Touch the memory to force actual allocation, then wait for the kernel.
        chunk.zero_()
        torch.cuda.synchronize()
except RuntimeError as e:
    # OOM before reaching the target is fine: keep whatever we grabbed.
    if 'out of memory' in str(e).lower():
        print(f"Allocated approximately {allocated / (1024**3):.2f} GB. Holding VRAM on A100.")
    else:
        raise e

# Hold the memory indefinitely
print("VRAM occupied. Running forever to hold it.")
while True:
    time.sleep(3600)  # Sleep 1 hour to minimize CPU usage; script will hold until killed
code/readability_final_res_process.ipynb ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "30a7b117",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json\n",
11
+ "import os\n",
12
+ "\n",
13
+ "# Define the file paths\n",
14
+ "file_paths = [\n",
15
+ " '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_0_100_qwen3-32B.json',\n",
16
+ " '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_100_200_qwen3-32B.json',\n",
17
+ " '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_200_300_qwen3-32B.json'\n",
18
+ "]\n",
19
+ "\n",
20
+ "merged_data = []\n",
21
+ "\n",
22
+ "# Loop through and append data\n",
23
+ "for file_path in file_paths:\n",
24
+ " if os.path.exists(file_path):\n",
25
+ " with open(file_path, 'r', encoding='utf-8') as f:\n",
26
+ " data = json.load(f)\n",
27
+ " # Assuming each file contains a list of objects\n",
28
+ " if isinstance(data, list):\n",
29
+ " merged_data.extend(data)\n",
30
+ " else:\n",
31
+ " merged_data.append(data)\n",
32
+ " print(f\"Successfully loaded: {file_path}\")\n",
33
+ " else:\n",
34
+ " print(f\"Warning: File not found: {file_path}\")\n",
35
+ "\n",
36
+ "# Save the merged result\n",
37
+ "output_path = '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_merged_0_300_qwen3-32B.json'\n",
38
+ "with open(output_path, 'w', encoding='utf-8') as f:\n",
39
+ " json.dump(merged_data, f, indent=4)\n",
40
+ "\n",
41
+ "print(f\"\\nTotal records merged: {len(merged_data)}\")\n",
42
+ "print(f\"Merged file saved to: {output_path}\")"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": null,
48
+ "id": "27ab3270",
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "import json\n",
53
+ "\n",
54
+ "# Define file paths\n",
55
+ "readability_path = '/home/mshahidul/readctrl/data/classified_readability/classified_multiclinsum_test_en.json'\n",
56
+ "reasoning_path = '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_merged_0_300_qwen3-32B.json'\n",
57
+ "output_path = '/home/mshahidul/readctrl/data/reasoning/merged_readability_reasoning_en_final.json'\n",
58
+ "\n",
59
+ "# 1. Load the readability data and create a lookup map\n",
60
+ "with open(readability_path, 'r') as f:\n",
61
+ " readability_data = json.load(f)\n",
62
+ "\n",
63
+ "# Create a dictionary for O(1) lookup: {id: score}\n",
64
+ "readability_lookup = {item['id']: item['readability_score'] for item in readability_data}\n",
65
+ "\n",
66
+ "# 2. Load the reasoning data\n",
67
+ "with open(reasoning_path, 'r') as f:\n",
68
+ " reasoning_data = json.load(f)\n",
69
+ "\n",
70
+ "# 3. Merge the scores into the reasoning data\n",
71
+ "merged_count = 0\n",
72
+ "for entry in reasoning_data:\n",
73
+ " entry_id = entry.get('id')\n",
74
+ " if entry_id in readability_lookup:\n",
75
+ " # Add the score to the existing dictionary\n",
76
+ " entry['readability_score'] = readability_lookup[entry_id]\n",
77
+ " merged_count += 1\n",
78
+ " else:\n",
79
+ " # Optional: Handle cases where an ID is missing in the readability file\n",
80
+ " entry['readability_score'] = None\n",
81
+ "\n",
82
+ "# 4. Save the merged result\n",
83
+ "with open(output_path, 'w') as f:\n",
84
+ " json.dump(reasoning_data, f, indent=4)\n",
85
+ "\n",
86
+ "print(f\"Successfully merged {merged_count} records. Saved to {output_path}\")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 3,
92
+ "id": "2ef2e0b6",
93
+ "metadata": {},
94
+ "outputs": [
95
+ {
96
+ "name": "stdout",
97
+ "output_type": "stream",
98
+ "text": [
99
+ "Threshold set to: 90.0%\n",
100
+ "Successfully saved 192 records to: /home/mshahidul/readctrl/data/final_result/processed_threshold_results.json\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "import json\n",
106
+ "import os\n",
107
+ "\n",
108
+ "# Configuration\n",
109
+ "input_file = '/home/mshahidul/readctrl/data/reasoning/merged_readability_reasoning_en_final.json'\n",
110
+ "output_dir = '/home/mshahidul/readctrl/data/final_result'\n",
111
+ "output_filename = 'processed_threshold_results.json'\n",
112
+ "\n",
113
+ "# Set your threshold here (e.g., 0.90 for 90%, 0.85 for 85%)\n",
114
+ "SUPPORT_THRESHOLD = 0.90 \n",
115
+ "\n",
116
+ "def process_with_threshold(threshold):\n",
117
+ " # Ensure the output folder exists\n",
118
+ " if not os.path.exists(output_dir):\n",
119
+ " os.makedirs(output_dir)\n",
120
+ "\n",
121
+ " # Load the source data\n",
122
+ " try:\n",
123
+ " with open(input_file, 'r') as f:\n",
124
+ " data = json.load(f)\n",
125
+ " except FileNotFoundError:\n",
126
+ " print(f\"Error: Source file not found at {input_file}\")\n",
127
+ " return\n",
128
+ "\n",
129
+ " final_output = []\n",
130
+ "\n",
131
+ " for item in data:\n",
132
+ " evals = item.get('subclaim_evaluations', [])\n",
133
+ " \n",
134
+ " if not evals:\n",
135
+ " continue # Skip items with no subclaims to evaluate\n",
136
+ " \n",
137
+ " # Calculate the percentage of supported subclaims\n",
138
+ " supported_count = sum(1 for sub in evals if sub.get('support_label') == 'supported')\n",
139
+ " support_ratio = supported_count / len(evals)\n",
140
+ " \n",
141
+ " # Keep if it meets the threshold (e.g., 0.90)\n",
142
+ " if support_ratio >= threshold:\n",
143
+ " clean_item = item.copy()\n",
144
+ " \n",
145
+ " # Map readability_score to difficulty\n",
146
+ " score = clean_item.get('readability_score', 0)\n",
147
+ " if score >= 4:\n",
148
+ " clean_item['difficulty'] = 'easy'\n",
149
+ " elif score == 3:\n",
150
+ " clean_item['difficulty'] = 'medium'\n",
151
+ " else:\n",
152
+ " clean_item['difficulty'] = 'hard'\n",
153
+ " \n",
154
+ " # Add metadata about the support ratio for transparency\n",
155
+ " clean_item['support_percentage'] = round(support_ratio * 100, 2)\n",
156
+ " \n",
157
+ " # Remove the subclaim_evaluations field\n",
158
+ " if 'subclaim_evaluations' in clean_item:\n",
159
+ " del clean_item['subclaim_evaluations']\n",
160
+ " \n",
161
+ " final_output.append(clean_item)\n",
162
+ "\n",
163
+ " # Save to a single JSON file\n",
164
+ " target_path = os.path.join(output_dir, output_filename)\n",
165
+ " with open(target_path, 'w', encoding='utf-8') as out_f:\n",
166
+ " json.dump(final_output, out_f, indent=4, ensure_ascii=False)\n",
167
+ " \n",
168
+ " print(f\"Threshold set to: {threshold * 100}%\")\n",
169
+ " print(f\"Successfully saved {len(final_output)} records to: {target_path}\")\n",
170
+ "\n",
171
+ "if __name__ == \"__main__\":\n",
172
+ " process_with_threshold(SUPPORT_THRESHOLD)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 4,
178
+ "id": "295a4a2a",
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "name": "stdout",
183
+ "output_type": "stream",
184
+ "text": [
185
+ "Success! Merged data saved to: /home/mshahidul/readctrl/data/factual_testing/merged_evaluated_support_0_300.json\n"
186
+ ]
187
+ }
188
+ ],
189
+ "source": [
190
+ "import json\n",
191
+ "import os\n",
192
+ "\n",
193
+ "# List of file paths to merge\n",
194
+ "file_paths = [\n",
195
+ " '/home/mshahidul/readctrl/data/factual_testing/evaluated_support_0_100_qwen3-32B.json',\n",
196
+ " '/home/mshahidul/readctrl/data/factual_testing/evaluated_support_100_200_qwen3-32B.json',\n",
197
+ " '/home/mshahidul/readctrl/data/factual_testing/evaluated_support_200_300_qwen3-32B.json'\n",
198
+ "]\n",
199
+ "\n",
200
+ "merged_data = []\n",
201
+ "\n",
202
+ "# Iterate through each file and append its contents to the list\n",
203
+ "for file_path in file_paths:\n",
204
+ " if os.path.exists(file_path):\n",
205
+ " with open(file_path, 'r', encoding='utf-8') as f:\n",
206
+ " data = json.load(f)\n",
207
+ " # If the JSON is a list, extend the merged list\n",
208
+ " if isinstance(data, list):\n",
209
+ " merged_data.extend(data)\n",
210
+ " # If the JSON is a single dictionary, append it\n",
211
+ " else:\n",
212
+ " merged_data.append(data)\n",
213
+ " else:\n",
214
+ " print(f\"Warning: File not found - {file_path}\")\n",
215
+ "\n",
216
+ "# Save the combined data to a new file\n",
217
+ "output_file = '/home/mshahidul/readctrl/data/factual_testing/merged_evaluated_support_0_300.json'\n",
218
+ "\n",
219
+ "with open(output_file, 'w', encoding='utf-8') as f:\n",
220
+ " json.dump(merged_data, f, indent=4)\n",
221
+ "\n",
222
+ "print(f\"Success! Merged data saved to: {output_file}\")"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 8,
228
+ "id": "e7ba1534",
229
+ "metadata": {},
230
+ "outputs": [
231
+ {
232
+ "name": "stdout",
233
+ "output_type": "stream",
234
+ "text": [
235
+ "Updating scores for 100 documents...\n",
236
+ "Successfully updated scores for 100 documents.\n",
237
+ "File saved to: /home/mshahidul/readctrl/data/reasoning/updated_scores/refined_v2_full_evaluation_200_300_qwen3-32B.json\n"
238
+ ]
239
+ }
240
+ ],
241
+ "source": [
242
+ "import json\n",
243
+ "import argparse\n",
244
+ "import os\n",
245
+ "\n",
246
+ "def calculate_scores(data):\n",
247
+ " \"\"\"\n",
248
+ " Recalculates factual_attribution and completeness scores based on \n",
249
+ " the updated labels in attribution_details and completeness_details.\n",
250
+ " \"\"\"\n",
251
+ " updated_count = 0\n",
252
+ "\n",
253
+ " for doc in data:\n",
254
+ " # 1. Recalculate Factual Attribution Score\n",
255
+ " attribution_list = doc.get('attribution_details', [])\n",
256
+ " if attribution_list:\n",
257
+ " supported_attr = sum(1 for item in attribution_list if item.get('label') == 'supported')\n",
258
+ " doc['scores']['factual_attribution'] = supported_attr / len(attribution_list)\n",
259
+ " else:\n",
260
+ " doc['scores']['factual_attribution'] = 0.0\n",
261
+ "\n",
262
+ " # 2. Recalculate Completeness Score\n",
263
+ " completeness_list = doc.get('completeness_details', [])\n",
264
+ " if completeness_list:\n",
265
+ " supported_comp = sum(1 for item in completeness_list if item.get('present_in_summary') == 'supported')\n",
266
+ " doc['scores']['completeness'] = supported_comp / len(completeness_list)\n",
267
+ " else:\n",
268
+ " doc['scores']['completeness'] = 0.0\n",
269
+ " \n",
270
+ " updated_count += 1\n",
271
+ "\n",
272
+ " return data, updated_count\n",
273
+ "\n",
274
+ "if __name__ == \"__main__\":\n",
275
+ " # parser = argparse.ArgumentParser(description=\"Update scores in refined clinical evaluation JSON.\")\n",
276
+ " # parser.add_argument(\"--input_file\", type=str, required=True, help=\"Path to the refined JSON file.\")\n",
277
+ " # parser.add_argument(\"--output_file\", type=str, help=\"Path to save the updated JSON. If omitted, overwrites input.\")\n",
278
+ " # args = parser.parse_args()\n",
279
+ " input_file = '/home/mshahidul/readctrl/data/reasoning/refined_v2_full_evaluation_200_300_qwen3-32B.json'\n",
280
+ " output_path = \"/home/mshahidul/readctrl/data/reasoning/updated_scores\"\n",
281
+ " output_file = os.path.join(output_path, os.path.basename(input_file))\n",
282
+ " # Load data\n",
283
+ " with open(input_file, 'r') as f:\n",
284
+ " data = json.load(f)\n",
285
+ "\n",
286
+ " print(f\"Updating scores for {len(data)} documents...\")\n",
287
+ " \n",
288
+ " # Process\n",
289
+ " updated_data, count = calculate_scores(data)\n",
290
+ "\n",
291
+ " \n",
292
+ " \n",
293
+ " # Save results\n",
294
+ " with open(output_file, 'w') as f:\n",
295
+ " json.dump(updated_data, f, indent=2, ensure_ascii=False)\n",
296
+ "\n",
297
+ " print(f\"Successfully updated scores for {count} documents.\")\n",
298
+ " print(f\"File saved to: {output_file}\")"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": 12,
304
+ "id": "612109dc",
305
+ "metadata": {},
306
+ "outputs": [
307
+ {
308
+ "name": "stdout",
309
+ "output_type": "stream",
310
+ "text": [
311
+ "dict_keys(['index', 'id', 'fulltext', 'fulltext_subclaims', 'summary', 'summary_subclaims', 'diff_label_texts', 'diff_label_subclaims', 'readability_score'])\n",
312
+ "dict_keys(['low_health_literacy', 'intermediate_health_literacy', 'proficient_health_literacy'])\n",
313
+ "dict_keys(['low_health_literacy', 'intermediate_health_literacy', 'proficient_health_literacy'])\n"
314
+ ]
315
+ }
316
+ ],
317
+ "source": [
318
+ "# /home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\n",
319
+ "import json\n",
320
+ "with open('/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json', 'r') as f:\n",
321
+ " anno_data = json.load(f)\n",
322
+ "print(anno_data[0].keys())\n",
323
+ "print(anno_data[0]['diff_label_texts'].keys())\n",
324
+ "print(anno_data[0]['diff_label_subclaims'].keys())"
325
+ ]
326
+ }
327
+ ],
328
+ "metadata": {
329
+ "kernelspec": {
330
+ "display_name": "un",
331
+ "language": "python",
332
+ "name": "python3"
333
+ },
334
+ "language_info": {
335
+ "codemirror_mode": {
336
+ "name": "ipython",
337
+ "version": 3
338
+ },
339
+ "file_extension": ".py",
340
+ "mimetype": "text/x-python",
341
+ "name": "python",
342
+ "nbconvert_exporter": "python",
343
+ "pygments_lexer": "ipython3",
344
+ "version": "3.11.14"
345
+ }
346
+ },
347
+ "nbformat": 4,
348
+ "nbformat_minor": 5
349
+ }
code/test.ipynb ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "25745a03",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# /home/mshahidul/readctrl/data/translated_data/translation_wo_judge/multiclinsum_gs_train_en2bn_gemma(0_200).json\n",
11
+ "import json\n",
12
+ "with open(\"/home/mshahidul/readctrl/data/translated_data/translation_wo_judge/multiclinsum_gs_train_en2bn_gemma(0_200).json\", \"r\") as f:\n",
13
+ " data = json.load(f)\n",
14
+ "\n",
15
+ "for item in data:\n",
16
+ " \n",
17
+ "\n",
18
+ "\n"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 6,
24
+ "id": "a170a10b",
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "data": {
29
+ "text/plain": [
30
+ "'14-year-old previously healthy adolescent who presented to the Primary Emergency Care Service (PEC) of Osorno with a 11-day history of a predominantly nocturnal irritative cough. Symptomatic treatment was indicated, evolving with dyspnoea and orthopnoea. He presented to the Emergency Department of the Osorno Base Hospital (OBH), with severe respiratory distress, intolerance to supine position, and abdominal pain. He was admitted to the Paediatric Intensive Care Unit (PICU), tachycardic, hypertensive, polypneic, oxygen saturation 96% with FiO2 35%, rosy, hydrated and well perfused, with flat jugular veins, small bilateral supraclavicular lymphadenopathies. The thorax was without retraction of soft tissue, maintained in a genupectoral position, with decreased pulmonary murmurs in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The soft abdomen was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. 
Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. 
He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. 
The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. 
Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. 
He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. 
The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. 
Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. 
He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. 
The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. 
Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. 
He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. 
The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. 
Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. 
He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and the cardiac auscultation had muffled tones, without breath sounds. 
The abdominal soft tissue was not easily depressible and sensitive in both hypochondria, with doubtful visceral enlargements and no injuries. The chest radiograph showed a superior mediastinal mass and atelectasis of the right middle lobe associated with ipsilateral pleural effusion. Contrast-enhanced chest X-ray was not performed due to contraindication of anaesthesia, as stated in the summary of transfer from OBH. He was transferred in a serious condition to the PICU HBV, with a Mediastinal Compression Syndrome, with clinical suspicion of non-Hodgkin lymphoma. He was evaluated by the paediatric haemato-oncology, paediatric surgery, paediatric intensive care, imaging, radiotherapy and paediatric oncology teams, with a normal pulmonary murmur in both bases, and\\n\\nA nephrological evaluation was performed, which confirmed renal failure secondary to tumor lysis syndrome, without dialysis urgency and tendency to hypertension, with creatinine 1.54 mg/dL, phosphemia 11 mg/dL, without hypernatremia. It continued with hyperhydration, diuretic (furosemide) and antihypertensive (amlodipine). From the respiratory point of view, it presented oxygen requirement, with FIO2 35% by mask of Venturi, suspending this supply on the third day of admission. It evolved with episodes of psychomotor agitation, associated to the diagnosis in process, which was treated according to the institutional protocol of psychomotor agitation, with psychological and psychiatric support, with satisfactory evolution. On the third day of admission and treatment a CT scan of the thorax, abdomen and pelvis was performed with contrast, observing an increase in the size of the thymus, of homogeneous aspect, probably in the context of a lymphoproliferative process and findings suggestive of pulmonary thromboembolism. 
The angioCT of the thorax showed thrombosis of the jugular vein, extensive bilateral pleural effusion associated to atelectatic phenomena in both bases, with signs of medical bilateral nephrosis. Anticoagulation with enoxaparin (1 mg/kg dose, every 12 hours) was indicated for twenty days. Then the angioCT of control showed resolution of the thrombosis.On the fourth day of admission and treatment, a diagnostic and extension study was performed, which included, among others, a complete biochemical profile including lipid profile, granulopoietic hyperplasia of the bone marrow (myelogram), flow cytometry (bone marrow) in which no cells with a predominant clonal or neoplastic immunophenotype of haemological lineage were observed, flow cytometry in peripheral blood negative for neoplastic cells, cytological of pleural fluid negative for neoplastic cells, flow cytometry of pleural fluid without evidence of haemological neoplasia. It was presented to the paediatric oncological committee, highlighting that it was not possible to take a biopsy of the tumour given that the mediastinal mass disappeared with the cytoreductive treatment, assuming the diagnosis of lymphoblastic lymphoma by the clinical picture and the response to treatment, according to the PINDA 0516 protocol. This protocol contemplates in Induction IA eight doses of Lasp E. coli of 10,000 IU/m2. Having received seven doses of L-asp and with a cumulative dose of ninety thousand international units plus glucocorticoid (prednisone), presented a picture of decline, vomiting, abdominal pain and mild dehydration. There was suspicion of pancreatitis, which was ruled out by normal amylase/lipase values and normal hepatic tests. At that time it had plasma electrolyte profile with hyponatraemia of 126 mOsm/kg and urinary osmolality of 510 mOsm/kg, both normal values. With hyponatraemia and hypertriglyceridaemia, there was suspicion of RAM of pseudohyponatraemia secondary to hypertriglyceridaemia associated to L-asp. 
It was evaluated by Gastroenterology and Endocrinology, indicating a diet low in refined sugars and rich in fiber, fibrates (ciprofibrato 100 mg oral daily) and omega 3 (4 g oral daily), until triglyceride values of 300 mg/dL were achieved. Two weeks later the triglycerides had a value of 79 mg/dL. Ciprofibrato and omega3 were suspended, indicating prophylactic use associated to corticoid and L-asp treatment. A total of twelve doses of L-asp were completed with a cumulative dose of one hundred and eighty four thousand international units corresponding to the induction protocol. The suspicion of RAM was subjected to causality evaluation, with the modified Karch and Lasagna algorithm by WHO5, which resulted in “Definitive” RAM for the association of L-asp and Prednisone\\n'"
31
+ ]
32
+ },
33
+ "execution_count": 6,
34
+ "metadata": {},
35
+ "output_type": "execute_result"
36
+ }
37
+ ],
38
+ "source": [
39
+ "txt"
40
+ ]
41
+ }
42
+ ],
43
+ "metadata": {
44
+ "kernelspec": {
45
+ "display_name": "unsloth",
46
+ "language": "python",
47
+ "name": "python3"
48
+ },
49
+ "language_info": {
50
+ "codemirror_mode": {
51
+ "name": "ipython",
52
+ "version": 3
53
+ },
54
+ "file_extension": ".py",
55
+ "mimetype": "text/x-python",
56
+ "name": "python",
57
+ "nbconvert_exporter": "python",
58
+ "pygments_lexer": "ipython3",
59
+ "version": "3.11.11"
60
+ }
61
+ },
62
+ "nbformat": 4,
63
+ "nbformat_minor": 5
64
+ }
code/text_classifier/dspy.ipynb ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "8a9d70f0",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import dspy\n",
11
+ "import json\n",
12
+ "from typing import Literal\n",
13
+ "from dspy.teleprompt import BootstrapFewShotWithRandomSearch\n",
14
+ "from dspy.evaluate import Evaluate\n",
15
+ "\n",
16
+ "# --- 1. LLM Configuration ---\n",
17
+ "api_file = \"/home/mshahidul/api_new.json\"\n",
18
+ "with open(api_file, \"r\") as f:\n",
19
+ " api_keys = json.load(f)\n",
20
+ "openai_api_key = api_keys[\"openai\"]\n",
21
+ "\n",
22
+ "# Student: Local vLLM (Deployment Model)\n",
23
+ "vllm_model = dspy.LM(\n",
24
+ " model='Qwen/Qwen3-30B-A3B-Instruct-2507',\n",
25
+ " api_base=\"http://172.16.34.29:8030/v1\",\n",
26
+ " api_key=\"EMPTY\",\n",
27
+ " temperature=0.0\n",
28
+ ")\n",
29
+ "\n",
30
+ "# Teacher: OpenAI (High-quality rationale generation)\n",
31
+ "# Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')\n",
32
+ "openai_model_teacher = dspy.LM(model='gpt-5', api_key=openai_api_key)\n",
33
+ "openai_model_student = dspy.LM(model='gpt-5-mini', api_key=openai_api_key)\n",
34
+ "\n",
35
+ "# Default LM for DSPy runtime\n",
36
+ "# Use the local vLLM for fast iteration; switch to openai_model_student if needed.\n",
37
+ "# dspy.configure(lm=vllm_model)\n",
38
+ "dspy.configure(lm=openai_model_student)"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "0f350ef4",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "class HealthLiteracySignature(dspy.Signature):\n",
49
+ " \"\"\"\n",
50
+ " Classify the health literacy level of a generated text \n",
51
+ " based on the original full source text.\n",
52
+ " \"\"\"\n",
53
+ " full_text = dspy.InputField(desc=\"The original clinical or source medical text.\")\n",
54
+ " generated_text = dspy.InputField(desc=\"The rewritten medical text to classify for health literacy based on the original source text.\")\n",
55
+ " \n",
56
+ " # Using Literal ensures the output is constrained to your three categories\n",
57
+ " literacy_label = dspy.OutputField(desc=\"One of: low_health_literacy, intermediate_health_literacy, proficient_health_literacy\")"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "id": "e369f8e8",
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": [
67
+ "class HealthLiteracyClassifier(dspy.Module):\n",
68
+ " def __init__(self):\n",
69
+ " super().__init__()\n",
70
+ " # Use ChainOfThought for better reasoning on medical jargon\n",
71
+ " self.classifier = dspy.ChainOfThought(HealthLiteracySignature)\n",
72
+ "\n",
73
+ " def forward(self, full_text, generated_text):\n",
74
+ " return self.classifier(full_text=full_text, generated_text=generated_text)"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "id": "055542d5",
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "def prepare_data(raw_data):\n",
85
+ " dataset = []\n",
86
+ " for item in raw_data:\n",
87
+ " example = dspy.Example(\n",
88
+ " full_text=item['fulltext'],\n",
89
+ " generated_text=item['diff_label_texts'],\n",
90
+ " literacy_label=item['label'] # Matches the Signature field\n",
91
+ " ).with_inputs('full_text', 'generated_text')\n",
92
+ " dataset.append(example)\n",
93
+ " return dataset[:100], dataset[100:]"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "id": "e570be47",
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "import json\n",
104
+ "path = \"/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json\"\n",
105
+ "raw_data = json.load(open(path))\n",
106
+ "trainset, testset = prepare_data(raw_data)"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": null,
112
+ "id": "39e90da8",
113
+ "metadata": {},
114
+ "outputs": [],
115
+ "source": [
116
+ "def health_literacy_metric(gold, pred, trace=None):\n",
117
+ " # Use 'literacy_label' because that is what's in your Signature\n",
118
+ " if not pred or not hasattr(pred, 'literacy_label'):\n",
119
+ " return False\n",
120
+ " \n",
121
+ " # Standardize both for comparison\n",
122
+ " gold_label = str(gold.literacy_label).strip().lower()\n",
123
+ " pred_label = str(pred.literacy_label).strip().lower()\n",
124
+ " \n",
125
+ " return gold_label == pred_label\n",
126
+ "\n",
127
+ "optimizer = BootstrapFewShotWithRandomSearch(\n",
128
+ " metric=health_literacy_metric,\n",
129
+ " max_bootstrapped_demos=3,\n",
130
+ " num_candidate_programs=8, \n",
131
+ " teacher_settings=dict(lm=openai_model_teacher)\n",
132
+ ")\n",
133
+ "\n",
134
+ "# 3. Compile! This creates the \"optimized prompt\"\n",
135
+ "compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)\n",
136
+ "\n",
137
+ "evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)\n",
138
+ "accuracy_score = evaluator(compiled_classifier)\n",
139
+ "compiled_classifier.save(\"health_literacy_model.json\")"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "markdown",
144
+ "id": "425291ff",
145
+ "metadata": {},
146
+ "source": [
147
+ "## "
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": 9,
153
+ "id": "f8ae33e8",
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "name": "stdout",
158
+ "output_type": "stream",
159
+ "text": [
160
+ "vllm-gpt-oss-20b_teacher-gpt5_v1\n",
161
+ "{'accuracy_score': 78.57, 'num_results': 84}\n",
162
+ "vllm-gemma-3-12b-it_teacher-gpt5_v1\n",
163
+ "{'accuracy_score': 79.76, 'num_results': 84}\n",
164
+ "vllm-Qwen2.5-7B-Instruct_teacher-gpt5_v1\n",
165
+ "{'accuracy_score': 59.52, 'num_results': 84}\n",
166
+ "student-gpt5-mini_teacher-gpt5_(fulltxt+gen_sum)\n",
167
+ "{'score': 88.1, 'results': 84}\n",
168
+ "vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1\n",
169
+ "{'accuracy_score': 78.57, 'num_results': 84}\n",
170
+ "vllm-phi-4_teacher-gpt5_v1\n",
171
+ "{'accuracy_score': 73.81, 'num_results': 84}\n",
172
+ "vllm-qwen3-8b_teacher-gpt5_v1\n",
173
+ "{'accuracy_score': 73.81, 'num_results': 84}\n",
174
+ "student-gpt5-mini_teacher-gpt5_v1\n",
175
+ "{'accuracy_score': 78.57, 'num_results': 84}\n"
176
+ ]
177
+ }
178
+ ],
179
+ "source": [
180
+ "# /home/mshahidul/readctrl/code/text_classifier/dspy_model\n",
181
+ "import os,json\n",
182
+ "folders = os.listdir(\"/home/mshahidul/readctrl/code/text_classifier/dspy_model\")\n",
183
+ "for folder in folders:\n",
184
+ " if os.path.isdir(f\"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder}\"):\n",
185
+ " files = os.listdir(f\"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder}\")\n",
186
+ " for file in files:\n",
187
+ " if file.endswith(\"accuracy.json\"):\n",
188
+ " path=(f\"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder}/{file}\")\n",
189
+ " print(path.split(\"/\")[-2])\n",
190
+ " data = json.load(open(f\"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder}/{file}\"))\n",
191
+ " print(data)\n"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": null,
197
+ "id": "4c236110",
198
+ "metadata": {},
199
+ "outputs": [],
200
+ "source": []
201
+ }
202
+ ],
203
+ "metadata": {
204
+ "kernelspec": {
205
+ "display_name": "unsloth",
206
+ "language": "python",
207
+ "name": "python3"
208
+ },
209
+ "language_info": {
210
+ "codemirror_mode": {
211
+ "name": "ipython",
212
+ "version": 3
213
+ },
214
+ "file_extension": ".py",
215
+ "mimetype": "text/x-python",
216
+ "name": "python",
217
+ "nbconvert_exporter": "python",
218
+ "pygments_lexer": "ipython3",
219
+ "version": "3.11.11"
220
+ }
221
+ },
222
+ "nbformat": 4,
223
+ "nbformat_minor": 5
224
+ }
code/text_classifier/qwen3_(4b)_instruct.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
6
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2"
7
+ from datasets import load_dataset
8
+ from unsloth import FastLanguageModel
9
+ from trl import SFTConfig, SFTTrainer
10
+
11
+ from unsloth.chat_templates import get_chat_template, train_on_responses_only
12
+
13
# Base model to fine-tune (unsloth build of Qwen3-8B).
MODEL_NAME = "unsloth/Qwen3-8B"
# Labelled training data; a held-out split is written to TEST_DATA_PATH below.
DATA_PATH = "verified_combined_0-80.json"
TEST_DATA_PATH = "verified_combined_0-80_test.json"
# Maximum tokenised sequence length for a training example.
MAX_SEQ_LENGTH = 4096
# Destination for the merged fp16 checkpoint written after training.
FP16_SAVE_DIR = "/home/mshahidul/readctrl_model/full_model/classifier_model"
# Fraction of the data held out for testing, and the RNG seed for the split.
TEST_SPLIT_RATIO = 0.1
SPLIT_SEED = 3407
20
+
21
SYSTEM_PROMPT = (
    "You are an expert medical editor and Health Literacy specialist. "
    "Classify the health literacy level of the provided text."
)

USER_PROMPT = """Classify the health literacy level of the rewritten text.

Labels:
- low_health_literacy: very simple, living-room language, minimal jargon.
- intermediate_health_literacy: standard public-friendly language, limited jargon.
- proficient_health_literacy: technical, clinical, or academic language.

Input:
Full Source Text:
<<<FULLTEXT>>>

Rewritten Text:
<<<DIFF_LABEL_TEXTS>>>

Output: Return only one label string from the list above."""


def build_messages(fulltext: str, diff_label_texts: str, label: str):
    """Build one three-turn chat example: system prompt, filled user
    prompt, and the gold label as the assistant reply."""
    # Substitute both placeholders in the user-prompt template.
    filled_prompt = USER_PROMPT.replace("<<<FULLTEXT>>>", fulltext)
    filled_prompt = filled_prompt.replace("<<<DIFF_LABEL_TEXTS>>>", diff_label_texts)
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": filled_prompt},
        {"role": "assistant", "content": label},
    ]
52
+
53
+
54
def main():
    """Fine-tune the base model with LoRA for health-literacy classification.

    Pipeline: load model -> attach LoRA adapters -> split dataset (persisting
    the held-out test split) -> format chat examples -> SFT-train on assistant
    responses only -> save the merged fp16 checkpoint.
    """
    # Load the base model in full precision (no 4/8-bit quantisation);
    # LoRA adapters are attached below instead of full fine-tuning.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )

    # Attach LoRA adapters to all attention and MLP projection layers.
    model = FastLanguageModel.get_peft_model(
        model,
        r=32,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=32,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",  # memory-efficient checkpointing
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    # Apply the Qwen3 chat template, split off a held-out test set, and
    # persist the test split so evaluation scripts can reuse it.
    tokenizer = get_chat_template(tokenizer, chat_template="qwen3-instruct")
    dataset = load_dataset("json", data_files=DATA_PATH, split="train")
    split = dataset.train_test_split(test_size=TEST_SPLIT_RATIO, seed=SPLIT_SEED)
    train_dataset = split["train"]
    test_dataset = split["test"]
    test_dataset.to_json(TEST_DATA_PATH)

    def formatting_prompts_func(examples):
        # Render each (fulltext, rewritten text, label) triple into one
        # chat-formatted training string.
        texts = []
        for fulltext, diff_label_texts, label in zip(
            examples["fulltext"],
            examples["diff_label_texts"],
            examples["label"],
        ):
            messages = build_messages(fulltext, diff_label_texts, label)
            text = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=False
            )
            texts.append(text)
        return {"text": texts}

    train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=None,
        args=SFTConfig(
            dataset_text_field="text",
            per_device_train_batch_size=64,
            gradient_accumulation_steps=16,
            warmup_steps=5,
            # max_steps=60,
            num_train_epochs=1,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.001,
            lr_scheduler_type="linear",
            seed=3407,
            report_to="none",
        ),
    )

    # Mask the prompt tokens so the loss is computed only on the assistant
    # reply (the label string), not on the instruction text.
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<|im_start|>user\n",
        response_part="<|im_start|>assistant\n",
    )

    trainer.train()

    # Merge the LoRA weights into the base model and save as fp16.
    os.makedirs(FP16_SAVE_DIR, exist_ok=True)
    model.save_pretrained_merged(
        FP16_SAVE_DIR,
        tokenizer,
        save_method="merged_16bit",
    )
143
+
144
+
145
+ if __name__ == "__main__":
146
+ main()
code/text_classifier/test_saved_dspy_vllm_gen_text_only.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import traceback
5
+ import urllib.error
6
+ import urllib.request
7
+
8
+ import dspy
9
+ from dspy.evaluate import Evaluate
10
+
11
+
12
# Default OpenAI-compatible endpoint of the local vLLM server.
DEFAULT_API_BASE = "http://172.16.34.22:8040/v1"
# Compiled DSPy program to evaluate.
DEFAULT_MODEL_PATH = (
    "/home/mshahidul/readctrl/code/text_classifier/dspy_model/vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1/model.json"
)
# Held-out labelled examples used for evaluation.
DEFAULT_TEST_PATH = "/home/mshahidul/readctrl/code/text_classifier/data/verified_combined_0-80_clean200.json"
# Where the accuracy summary JSON is written.
DEFAULT_OUTPUT_PATH = (
    "/home/mshahidul/readctrl/code/text_classifier/accuracy/"
    "vllm-llama-3.1-8b-awq-int4_teacher-gpt5_v1_clean200_eval.json"
)
21
+
22
+
23
# NOTE(review): dspy treats a Signature's class docstring as the task
# instructions sent to the LM, so no docstring is added here — the compiled
# program loaded from disk supplies its own prompt. Verify before changing.
class HealthLiteracySignature(dspy.Signature):
    # Input: the rewritten text whose literacy level is being judged.
    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )
    # Output: exactly one of the three literacy labels named in the desc.
    literacy_label = dspy.OutputField(
        desc=(
            "Classification: low_health_literacy (simple words, no jargon), "
            "intermediate_health_literacy (moderate technicality), or "
            "proficient_health_literacy (highly technical/original level)."
        )
    )
34
+
35
+
36
class HealthLiteracyClassifier(dspy.Module):
    """DSPy module that classifies a rewritten text's health-literacy level."""

    def __init__(self):
        super().__init__()
        # ChainOfThought adds an intermediate reasoning field before the label.
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, generated_text):
        """Return the prediction (carries `literacy_label`) for *generated_text*."""
        return self.classifier(generated_text=generated_text)
43
+
44
+
45
def parse_args():
    """Parse command-line options for the saved-model evaluation run."""
    cli = argparse.ArgumentParser(
        description="Load a saved DSPy model and evaluate on test set."
    )
    # Paths to the compiled DSPy program and the labelled test file.
    cli.add_argument("--model-path", default=DEFAULT_MODEL_PATH)
    cli.add_argument("--test-path", default=DEFAULT_TEST_PATH)
    # Endpoint can also come from the environment for cluster runs.
    cli.add_argument(
        "--api-base",
        default=os.environ.get("VLLM_API_BASE", DEFAULT_API_BASE),
    )
    cli.add_argument("--num-threads", type=int, default=1)
    cli.add_argument("--output-path", default=DEFAULT_OUTPUT_PATH)
    cli.add_argument(
        "--provide-traceback",
        action="store_true",
        help="Print full traceback if runtime error happens.",
    )
    return cli.parse_args()
63
+
64
+
65
def check_api_base(api_base):
    """Probe the server's /models route; raise if unreachable or unhealthy.

    Raises ConnectionError when the endpoint cannot be reached at all, and
    RuntimeError when it answers but with an error status.
    """
    endpoint = api_base.rstrip("/") + "/models"
    probe = urllib.request.Request(endpoint, method="GET")
    try:
        response = urllib.request.urlopen(probe, timeout=5)
    except urllib.error.URLError as exc:
        raise ConnectionError(
            "Cannot reach OpenAI-compatible endpoint. "
            f"api_base={api_base}. "
            "Start your vLLM server or pass correct --api-base."
        ) from exc
    with response:
        # urlopen normally raises HTTPError for >=400; this guards any
        # handler that returns an error status instead of raising.
        if response.status >= 400:
            raise RuntimeError(
                f"Endpoint reachable but unhealthy: {endpoint} (status={response.status})"
            )
80
+
81
+
82
def load_testset(path):
    """Read evaluation examples from a .jsonl or .json file as dspy Examples."""

    def make_example(text, label):
        # Only the generated text is an input; the label is the gold target.
        return dspy.Example(
            generated_text=text,
            literacy_label=label,
        ).with_inputs("generated_text")

    examples = []
    if path.endswith(".jsonl"):
        # JSON Lines: one record per non-blank line, strict key names.
        with open(path, "r") as handle:
            for raw in handle:
                if not raw.strip():
                    continue
                record = json.loads(raw)
                examples.append(
                    make_example(record["generated_text"], record["literacy_label"])
                )
    else:
        # Single JSON array: tolerate legacy key names, skip incomplete rows.
        with open(path, "r") as handle:
            records = json.load(handle)
        for record in records:
            text = record.get("generated_text", record.get("diff_label_texts"))
            label = record.get("literacy_label", record.get("label"))
            if not text or not label:
                continue
            examples.append(make_example(text, label))
    return examples
109
+
110
+
111
def health_literacy_metric(gold, pred, trace=None):
    """Lenient accuracy metric: the gold label must occur within the prediction.

    Comparison is case-insensitive and ignores surrounding whitespace; a
    missing prediction or one without a `literacy_label` counts as incorrect.
    """
    if not pred or not hasattr(pred, "literacy_label"):
        return False

    def canon(value):
        return str(value).strip().lower()

    # Substring match tolerates decorated outputs such as
    # "label: low_health_literacy".
    return canon(gold.literacy_label) in canon(pred.literacy_label)
117
+
118
+
119
def load_compiled_classifier(path):
    """Load a compiled DSPy program, preferring dspy.load over module.load."""
    loader = getattr(dspy, "load", None)
    if loader is not None:
        try:
            return loader(path)
        except Exception as exc:
            # Fall through to the instance-level load below.
            print(
                f"[warning] dspy.load failed ({type(exc).__name__}); "
                "trying module.load(...)"
            )

    module = HealthLiteracyClassifier()
    try:
        module.load(path)
    except Exception as exc:
        raise RuntimeError(f"Failed to load compiled model from {path}") from exc
    return module
135
+
136
+
137
def main():
    """Entry point: evaluate a saved DSPy classifier on a labelled test set."""
    args = parse_args()

    # Fail fast on bad paths before touching the network.
    if not os.path.exists(args.model_path):
        raise FileNotFoundError(f"Model file not found: {args.model_path}")
    if not os.path.exists(args.test_path):
        raise FileNotFoundError(f"Test file not found: {args.test_path}")

    try:
        check_api_base(args.api_base)

        # NOTE(review): "openai/dspy" looks like a placeholder model name for a
        # single-model vLLM server that ignores it — confirm against the server.
        lm = dspy.LM(
            model="openai/dspy",
            api_base=args.api_base,
            api_key="EMPTY",
            temperature=0.0,
        )
        dspy.configure(lm=lm)

        testset = load_testset(args.test_path)
        compiled_classifier = load_compiled_classifier(args.model_path)

        evaluator = Evaluate(
            devset=testset,
            metric=health_literacy_metric,
            num_threads=args.num_threads,
            display_progress=True,
        )
        evaluation_result = evaluator(compiled_classifier)
        # Prefer the .score attribute when present; otherwise treat the
        # result itself as a numeric score.
        accuracy_score = (
            float(evaluation_result.score)
            if hasattr(evaluation_result, "score")
            else float(evaluation_result)
        )

        output_data = {
            "model_path": args.model_path,
            "test_path": args.test_path,
            "accuracy_score": accuracy_score,
            # `or []` guards a results attribute that exists but is None.
            "num_results": len(getattr(evaluation_result, "results", []) or []),
        }

        # Persist the summary next to the other accuracy reports.
        os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
        with open(args.output_path, "w") as f:
            json.dump(output_data, f, indent=2)

        print(evaluation_result)
        print(json.dumps(output_data, indent=2))
    except Exception as exc:
        # Summarise the failure, optionally print the traceback, and
        # re-raise so the process exits non-zero.
        print(f"[error] {type(exc).__name__}: {exc}")
        if args.provide_traceback:
            traceback.print_exc()
        raise
190
+
191
+
192
+ if __name__ == "__main__":
193
+ main()
code/text_classifier/text_classifier_dspy.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import dspy
import json
import os
import random
from typing import Literal
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate import Evaluate

# --- 1. LLM Configuration ---
# Provider-name -> API-key mapping kept outside the repo.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]

# Student: Local vLLM (Deployment Model)
vllm_model = dspy.LM(
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
    api_base="http://172.16.34.29:8030/v1",
    api_key="EMPTY",  # local vLLM servers ignore the key
    temperature=0.0
)

# Teacher: OpenAI (High-quality rationale generation)
# Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')
openai_model_teacher = dspy.LM(model='gpt-5', api_key=openai_api_key)
openai_model_student = dspy.LM(model='gpt-5-mini', api_key=openai_api_key)

# Default LM for DSPy runtime
# Use the local vLLM for fast iteration; switch to openai_model_student if needed.
# dspy.configure(lm=vllm_model)
dspy.configure(lm=openai_model_student)
32
+
33
class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' relative to 'full_text' to determine
    the health literacy level.
    """
    # Input: the untouched source document the rewrite is compared against.
    full_text = dspy.InputField(desc="Original clinical or medical source text containing jargon and technical details.")
    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )

    # Output: one of the three canonical label strings described in the desc.
    literacy_label = dspy.OutputField(
        desc="Classification: low_health_literacy (simple words, no jargon), intermediate_health_literacy (moderate technicality), or proficient_health_literacy (highly technical/original level)."
    )
47
+
48
class HealthLiteracyClassifier(dspy.Module):
    """Chain-of-thought predictor over HealthLiteracySignature (two text inputs)."""

    def __init__(self):
        super().__init__()
        # Use ChainOfThought for better reasoning on medical jargon
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, full_text, generated_text):
        # Callers read 'literacy_label' off the returned prediction.
        return self.classifier(full_text=full_text, generated_text=generated_text)
56
+
57
def prepare_data(raw_data, seed=42, train_ratio=0.6):
    """Build balanced train/test splits of dspy Examples from raw records.

    Records are grouped by label, every group is truncated to the size of
    the smallest one, shuffled deterministically via `seed`, and split per
    label so both splits keep an even class distribution.
    """
    labels = [
        "low_health_literacy",
        "intermediate_health_literacy",
        "proficient_health_literacy",
    ]
    rand = random.Random(seed)

    grouped = {name: [] for name in labels}
    for record in raw_data:
        tag = record.get("label")
        if tag in grouped:
            grouped[tag].append(
                dspy.Example(
                    full_text=record["fulltext"],
                    generated_text=record["diff_label_texts"],
                    literacy_label=tag,  # matches the Signature output field
                ).with_inputs("full_text", "generated_text")
            )

    smallest = min(len(grouped[name]) for name in labels)
    if smallest == 0:
        raise ValueError("One or more labels has no examples; cannot balance.")

    # Clamp the train share so both splits get at least one example per label.
    n_train = int(round(smallest * train_ratio))
    n_train = max(1, min(n_train, smallest - 1))

    trainset, testset = [], []
    for name in labels:
        rand.shuffle(grouped[name])
        kept = grouped[name][:smallest]
        trainset.extend(kept[:n_train])
        testset.extend(kept[n_train:smallest])

    rand.shuffle(trainset)
    rand.shuffle(testset)
    return trainset, testset
95
+
96
+
97
# NOTE: 'json' is already imported at the top of the file; this re-import is
# redundant but harmless.
import json
path = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
raw_data = json.load(open(path))
trainset, testset = prepare_data(raw_data)
101
+
102
+ def _example_to_dict(example):
103
+ return {
104
+ "full_text": example.full_text,
105
+ "generated_text": example.generated_text,
106
+ "literacy_label": example.literacy_label,
107
+ }
108
+
109
def save_jsonl(path, examples):
    """Write the examples as JSONL: one JSON object per line, UTF-8 preserved."""
    with open(path, "w") as sink:
        for item in examples:
            line = json.dumps(_example_to_dict(item), ensure_ascii=False)
            sink.write(line + "\n")
113
+
114
# Persist the balanced splits as JSONL for downstream training/eval scripts.
train_path = "/home/mshahidul/readctrl/code/text_classifier/train.jsonl"
test_path = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
save_jsonl(train_path, trainset)
save_jsonl(test_path, testset)
118
+
119
def health_literacy_metric(gold, pred, trace=None):
    """Lenient accuracy metric: the gold label must appear in the prediction.

    Substring matching tolerates verbose model output (a label embedded in a
    sentence) while still distinguishing the three canonical labels.
    """
    if not pred or not hasattr(pred, 'literacy_label'):
        return False

    expected = str(gold.literacy_label).strip().lower()
    produced = str(pred.literacy_label).strip().lower()
    return expected in produced
128
+
129
# 2. Optimizer: bootstrap few-shot demos with the stronger teacher model,
# then random-search over candidate prompt programs.
optimizer = BootstrapFewShotWithRandomSearch(
    metric=health_literacy_metric,
    max_bootstrapped_demos=3,
    num_candidate_programs=8,
    teacher_settings=dict(lm=openai_model_teacher)
)

# 3. Compile! This creates the "optimized prompt"
compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)

evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)
evaluation_result = evaluator(compiled_classifier)
# Some dspy versions return an object exposing .score, others a bare float.
accuracy_score = (
    float(evaluation_result.score)
    if hasattr(evaluation_result, "score")
    else float(evaluation_result)
)
146
+
147
+ def _extract_usage(record):
148
+ if isinstance(record, dict):
149
+ usage = record.get("usage")
150
+ if usage:
151
+ return usage
152
+ response = record.get("response")
153
+ if isinstance(response, dict) and response.get("usage"):
154
+ return response["usage"]
155
+ return None
156
+
157
def calc_cost_usd(lm, price_in_per_1m, price_out_per_1m, price_cached_in_per_1m=None):
    """Sum token usage across an LM's call history and price it in USD.

    Prices are per 1M tokens. Cached-input pricing is added only when a
    rate is supplied. Returns token totals plus the total cost.
    """
    totals = {"in": 0, "out": 0, "cached": 0}
    for entry in getattr(lm, "history", []) or []:
        usage = _extract_usage(entry)
        if not usage:
            continue
        # Providers disagree on field names; accept both spellings.
        totals["in"] += int(usage.get("prompt_tokens", usage.get("input_tokens", 0)) or 0)
        totals["out"] += int(usage.get("completion_tokens", usage.get("output_tokens", 0)) or 0)
        totals["cached"] += int(usage.get("cached_tokens", usage.get("prompt_tokens_cached", 0)) or 0)

    cost = (totals["in"] / 1_000_000) * price_in_per_1m
    cost += (totals["out"] / 1_000_000) * price_out_per_1m
    if price_cached_in_per_1m is not None:
        cost += (totals["cached"] / 1_000_000) * price_cached_in_per_1m

    return {
        "prompt_tokens": totals["in"],
        "completion_tokens": totals["out"],
        "cached_tokens": totals["cached"],
        "cost_usd": cost,
    }
178
+
179
# Fill these with current OpenAI pricing (USD per 1M tokens).
GPT5_PRICE_INPUT_PER_1M = 1.25
GPT5_PRICE_OUTPUT_PER_1M = 10.0
GPT5_MINI_PRICE_INPUT_PER_1M = 0.25
GPT5_MINI_PRICE_OUTPUT_PER_1M = 2.0

# Price the teacher (compile-time) and student (runtime) call histories separately.
teacher_cost = calc_cost_usd(
    openai_model_teacher,
    GPT5_PRICE_INPUT_PER_1M,
    GPT5_PRICE_OUTPUT_PER_1M,
)
student_cost = calc_cost_usd(
    openai_model_student,
    GPT5_MINI_PRICE_INPUT_PER_1M,
    GPT5_MINI_PRICE_OUTPUT_PER_1M,
)

cost_report = {
    "gpt-5": teacher_cost,
    "gpt-5-mini": student_cost,
}
# Persist the compiled program plus accuracy/cost reports under a versioned folder.
folder_name="student-gpt5-mini_teacher-gpt5_v1"
os.makedirs(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}", exist_ok=True)
compiled_classifier.save(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/model.json")

print(evaluation_result)
print(json.dumps(cost_report, indent=2))
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/accuracy.json", "w") as f:
    json.dump(
        {
            "accuracy_score": accuracy_score,
            "num_results": len(getattr(evaluation_result, "results", []) or []),
        },
        f,
        indent=2,
    )
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/cost.json", "w") as f:
    json.dump(cost_report, f, indent=2)
code/text_classifier/text_classifier_dspy_load_and_infer_full.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import json
import os
from collections import Counter
from typing import Dict, List, Tuple

import dspy
from tqdm import tqdm


# Provider-name -> API-key JSON file, kept outside the repo.
API_FILE = "/home/mshahidul/api_new.json"
# Default artifact locations for the v1 student/teacher run.
DEFAULT_MODEL_PATH = "/home/mshahidul/readctrl/code/text_classifier/dspy_model/student-gpt5-mini_teacher-gpt5_v1/model.json"
DEFAULT_DATASET_PATH = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
DEFAULT_OUTPUT_PATH = "/home/mshahidul/readctrl/code/text_classifier/dspy_model/student-gpt5-mini_teacher-gpt5_v1/full_dataset_accuracy.json"
DEFAULT_PREDICTIONS_PATH = "/home/mshahidul/readctrl/code/text_classifier/dspy_model/student-gpt5-mini_teacher-gpt5_v1/full_dataset_predictions.json"
DEFAULT_CLEAN_DATASET_PATH = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80_clean200.json"
DEFAULT_REMOVED_PATH = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80_removed21.json"
# The three canonical literacy labels; LABEL_ORDER ranks them so the
# gold/predicted rank distance can serve as a misclassification severity.
VALID_LABELS = {
    "low_health_literacy",
    "intermediate_health_literacy",
    "proficient_health_literacy",
}
LABEL_ORDER = {
    "low_health_literacy": 0,
    "intermediate_health_literacy": 1,
    "proficient_health_literacy": 2,
}
29
+
30
class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' to determine the health literacy level.
    """

    # NOTE(review): presumably this must mirror the signature the saved
    # model.json was compiled with — confirm against the training script.
    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )
    literacy_label = dspy.OutputField(
        desc=(
            "Classification: low_health_literacy (simple words, no jargon), "
            "intermediate_health_literacy (moderate technicality), or "
            "proficient_health_literacy (highly technical/original level)."
        )
    )
46
+
47
+
48
class HealthLiteracyClassifier(dspy.Module):
    """Chain-of-thought predictor over HealthLiteracySignature (text-only input)."""

    def __init__(self):
        super().__init__()
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, generated_text):
        # Callers read 'literacy_label' off the returned prediction.
        return self.classifier(generated_text=generated_text)
55
+
56
+
57
def load_openai_key(api_file: str) -> str:
    """Read the OpenAI API key from a provider->key JSON file.

    Raises KeyError with a pointed message when no 'openai' entry exists.
    """
    with open(api_file, "r") as handle:
        keys = json.load(handle)
    if "openai" in keys:
        return keys["openai"]
    raise KeyError(f"'openai' key is missing in {api_file}")
63
+
64
+
65
def normalize_label(text: str) -> str:
    """Lowercase and trim a label; None/falsy input normalizes to ''."""
    cleaned = text if text else ""
    return str(cleaned).strip().lower()
67
+
68
+
69
def is_correct(gold_label: str, predicted_label: str) -> bool:
    """Lenient match: the normalized gold label occurs inside the prediction."""
    return normalize_label(gold_label) in normalize_label(predicted_label)
73
+
74
+
75
def extract_predicted_label(predicted_text: str) -> str:
    """Map free-form model output to exactly one canonical label.

    Returns '' when zero or multiple canonical labels occur in the text,
    so ambiguous predictions are treated as unparseable.
    """
    normalized = normalize_label(predicted_text)
    hits = [candidate for candidate in VALID_LABELS if candidate in normalized]
    return hits[0] if len(hits) == 1 else ""
81
+
82
+
83
def misclassification_severity(gold_label: str, predicted_label: str) -> int:
    """Distance between gold and predicted literacy ranks (0-2).

    Unknown/unparseable labels score 3, i.e. worse than any real confusion.
    """
    gold_rank = LABEL_ORDER.get(gold_label)
    pred_rank = LABEL_ORDER.get(predicted_label)
    if gold_rank is None or pred_rank is None:
        return 3
    return abs(gold_rank - pred_rank)
90
+
91
+
92
def load_full_examples(dataset_path: str):
    """Load every usable record from the dataset JSON.

    Keeps only items with a canonical label and non-empty text, preserving
    the original index and the raw item so cleaned/removed datasets can be
    re-exported verbatim. Raises ValueError when nothing usable is found.
    """
    with open(dataset_path, "r") as f:
        raw_data = json.load(f)

    examples = []
    for idx, item in enumerate(raw_data):
        label = item.get("label")
        text = item.get("diff_label_texts")
        if label in VALID_LABELS and text:
            examples.append(
                {
                    "index": idx,  # position in the original file
                    "generated_text": text,
                    "gold_label": label,
                    "doc_id": item.get("doc_id"),
                    "raw_item": item,  # kept verbatim for re-export
                }
            )
    if not examples:
        raise ValueError("No valid labeled examples found in dataset.")
    return examples
113
+
114
+
115
def choose_indices_to_remove(
    predictions: List[Dict], remove_count: int
) -> Tuple[List[Dict], List[int]]:
    """Pick `remove_count` examples to drop, preferring the worst predictions.

    Ranking prefers (in order): misclassified over correct, higher severity,
    unparseable predictions, longer raw prediction text, then original index
    as a deterministic tiebreak. Removals are balanced across the three
    gold labels as evenly as possible. Returns (removed records, sorted
    removed indices).
    """
    def _rank_key(p: Dict):
        # Lower tuples sort first, i.e. "worse" items come first.
        return (
            0 if not p["exact_correct"] else 1,
            -p["severity"],
            0 if not p["predicted_label"] else 1,
            -len(normalize_label(p["raw_prediction_text"])),
            p["index"],
        )

    label_sequence = sorted(VALID_LABELS, key=lambda x: LABEL_ORDER[x])
    per_label_all = {label: [] for label in label_sequence}
    per_label_mis = {label: [] for label in label_sequence}
    for p in predictions:
        label = p["gold_label"]
        if label in per_label_all:
            per_label_all[label].append(p)
            if not p["exact_correct"]:
                per_label_mis[label].append(p)

    for label in label_sequence:
        per_label_all[label].sort(key=_rank_key)
        per_label_mis[label].sort(key=_rank_key)

    # Balanced quota (approximately equal removals per label).
    num_labels = len(label_sequence)
    base_quota = remove_count // num_labels
    remainder = remove_count % num_labels
    quotas = {label: base_quota for label in label_sequence}

    # Assign remainder to labels with more misclassified candidates first.
    remainder_order = sorted(
        label_sequence,
        key=lambda label: (-len(per_label_mis[label]), LABEL_ORDER[label]),
    )
    for label in remainder_order[:remainder]:
        quotas[label] += 1

    removed = []
    removed_indices_set = set()

    # First pass: satisfy each label quota with misclassified items.
    for label in label_sequence:
        take = min(quotas[label], len(per_label_mis[label]))
        for item in per_label_mis[label][:take]:
            removed.append(item)
            removed_indices_set.add(item["index"])

    # Second pass: if some quotas could not be met, fill within those labels
    # using next-worst remaining items (can include correctly classified).
    for label in label_sequence:
        needed = quotas[label] - sum(1 for x in removed if x["gold_label"] == label)
        if needed <= 0:
            continue
        candidates = [
            x for x in per_label_all[label] if x["index"] not in removed_indices_set
        ]
        for item in candidates[:needed]:
            removed.append(item)
            removed_indices_set.add(item["index"])

    # Final pass: if still short (edge cases), fill globally by worst rank.
    if len(removed) < remove_count:
        remaining_global = sorted(
            (p for p in predictions if p["index"] not in removed_indices_set),
            key=_rank_key,
        )
        need = remove_count - len(removed)
        for item in remaining_global[:need]:
            removed.append(item)
            removed_indices_set.add(item["index"])

    # Keep deterministic order in output by rank.
    removed = sorted(removed, key=_rank_key)[:remove_count]
    removed_indices = sorted(p["index"] for p in removed)
    return removed, removed_indices
193
+
194
+
195
def run_inference(
    model_path: str,
    dataset_path: str,
    output_path: str,
    predictions_path: str,
    clean_dataset_path: str,
    removed_path: str,
    target_clean_size: int,
):
    """Classify the full dataset with a compiled DSPy model and prune it.

    Runs the saved classifier over every example, records lenient/exact
    accuracy, removes the worst (total - target_clean_size) predictions in
    a label-balanced way, and writes four artifacts: an accuracy report,
    per-example predictions, the cleaned dataset, and the removed subset.
    """
    openai_api_key = load_openai_key(API_FILE)
    student_lm = dspy.LM(model="gpt-5-mini", api_key=openai_api_key)
    dspy.configure(lm=student_lm)

    classifier = HealthLiteracyClassifier()
    classifier.load(model_path)

    examples = load_full_examples(dataset_path)
    total = len(examples)
    # The clean set must be a strict, non-empty subset of the dataset.
    if target_clean_size <= 0 or target_clean_size >= total:
        raise ValueError(
            f"target_clean_size must be between 1 and {total - 1}, got {target_clean_size}"
        )

    remove_count = total - target_clean_size
    correct = 0
    label_totals = Counter()
    label_correct = Counter()
    predictions = []

    for idx, ex in enumerate(
        tqdm(examples, desc="Classifying full dataset", unit="sample"), start=1
    ):
        pred = classifier(generated_text=ex["generated_text"])
        raw_pred_label = getattr(pred, "literacy_label", "")
        pred_label = extract_predicted_label(raw_pred_label)
        gold_label = ex["gold_label"]
        # Exact = canonical label match; lenient = gold occurs in raw text.
        exact_correct = pred_label == gold_label
        lenient_correct = is_correct(gold_label, raw_pred_label)
        severity = (
            misclassification_severity(gold_label, pred_label) if not exact_correct else 0
        )

        label_totals[gold_label] += 1
        if lenient_correct:
            correct += 1
            label_correct[gold_label] += 1

        predictions.append(
            {
                "index": ex["index"],
                "doc_id": ex["doc_id"],
                "gold_label": gold_label,
                "predicted_label": pred_label,
                "raw_prediction_text": raw_pred_label,
                "lenient_correct": lenient_correct,
                "exact_correct": exact_correct,
                "severity": severity,
                "generated_text": ex["generated_text"],
            }
        )

        if idx % 10 == 0 or idx == total:
            tqdm.write(f"Processed {idx}/{total}")

    accuracy = correct / total if total else 0.0
    exact_accuracy = (
        sum(1 for p in predictions if p["exact_correct"]) / total if total else 0.0
    )
    per_label_accuracy = {
        label: (
            (label_correct[label] / label_totals[label]) if label_totals[label] else 0.0
        )
        for label in sorted(VALID_LABELS)
    }
    # Drop the worst predictions, balanced across gold labels.
    removed_examples, removed_indices = choose_indices_to_remove(predictions, remove_count)
    removed_index_set = set(removed_indices)
    clean_dataset = [
        p["raw_item"]
        for p in examples
        if p["index"] not in removed_index_set
    ]
    removed_dataset = [
        p["raw_item"]
        for p in examples
        if p["index"] in removed_index_set
    ]

    report = {
        "model_path": model_path,
        "dataset_path": dataset_path,
        "num_examples": total,
        "num_correct": correct,
        "lenient_accuracy": accuracy,
        "exact_accuracy": exact_accuracy,
        "per_label_accuracy": per_label_accuracy,
        "target_clean_size": target_clean_size,
        "removed_count": remove_count,
        "clean_dataset_size": len(clean_dataset),
        "removed_dataset_size": len(removed_dataset),
        "removed_misclassified_count": sum(
            1 for p in removed_examples if not p["exact_correct"]
        ),
        "removed_per_label": dict(
            Counter(p["gold_label"] for p in removed_examples)
        ),
    }

    # Ensure every output directory exists before writing.
    for path in [
        output_path,
        predictions_path,
        clean_dataset_path,
        removed_path,
    ]:
        output_dir = os.path.dirname(path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w") as f:
        json.dump(report, f, indent=2)
    with open(predictions_path, "w") as f:
        json.dump(predictions, f, indent=2)
    with open(clean_dataset_path, "w") as f:
        json.dump(clean_dataset, f, indent=2, ensure_ascii=False)
    with open(removed_path, "w") as f:
        json.dump(removed_dataset, f, indent=2, ensure_ascii=False)

    print(json.dumps(report, indent=2))
    print(f"Saved predictions to: {predictions_path}")
    print(f"Saved clean dataset to: {clean_dataset_path}")
    print(f"Saved removed examples to: {removed_path}")
    print(f"Saved report to: {output_path}")
326
+
327
+
328
def main():
    """CLI wrapper: parse paths/sizes and run the full-dataset inference."""
    parser = argparse.ArgumentParser(
        description="Load a compiled DSPy classifier and evaluate on full dataset."
    )
    parser.add_argument("--model-path", default=DEFAULT_MODEL_PATH)
    parser.add_argument("--dataset-path", default=DEFAULT_DATASET_PATH)
    parser.add_argument("--output-path", default=DEFAULT_OUTPUT_PATH)
    parser.add_argument("--predictions-path", default=DEFAULT_PREDICTIONS_PATH)
    parser.add_argument("--clean-dataset-path", default=DEFAULT_CLEAN_DATASET_PATH)
    parser.add_argument("--removed-path", default=DEFAULT_REMOVED_PATH)
    # Size the cleaned dataset is pruned down to (default keeps 200 examples).
    parser.add_argument("--target-clean-size", type=int, default=200)
    args = parser.parse_args()

    run_inference(
        model_path=args.model_path,
        dataset_path=args.dataset_path,
        output_path=args.output_path,
        predictions_path=args.predictions_path,
        clean_dataset_path=args.clean_dataset_path,
        removed_path=args.removed_path,
        target_clean_size=args.target_clean_size,
    )


if __name__ == "__main__":
    main()
code/text_classifier/text_classifier_dspy_only_gen_text.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import dspy
import json
import os
import random
from typing import Literal
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate import Evaluate

# --- 1. LLM Configuration ---
# Provider-name -> API-key mapping kept outside the repo.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]

# Student: Local vLLM (Deployment Model)
vllm_model = dspy.LM(
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
    api_base="http://172.16.34.29:8030/v1",
    api_key="EMPTY",  # local vLLM servers ignore the key
    temperature=0.0
)

# Teacher: OpenAI (High-quality rationale generation)
# Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')
openai_model_teacher = dspy.LM(model='gpt-5', api_key=openai_api_key)
openai_model_student = dspy.LM(model='gpt-5-mini', api_key=openai_api_key)

# Default LM for DSPy runtime
# Use the local vLLM for fast iteration; switch to openai_model_student if needed.
# dspy.configure(lm=vllm_model)
dspy.configure(lm=openai_model_student)
32
+
33
class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' to determine the health literacy level.
    """
    # Text-only variant: no source document is provided as input.
    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )

    literacy_label = dspy.OutputField(
        desc="Classification: low_health_literacy (simple words, no jargon), intermediate_health_literacy (moderate technicality), or proficient_health_literacy (highly technical/original level)."
    )

class HealthLiteracyClassifier(dspy.Module):
    """Chain-of-thought predictor over HealthLiteracySignature (text-only)."""

    def __init__(self):
        super().__init__()
        # Use ChainOfThought for better reasoning on medical jargon
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, generated_text):
        return self.classifier(generated_text=generated_text)

def prepare_data(raw_data, seed=42, train_ratio=0.6):
    """Build label-balanced train/test splits of dspy Examples (deterministic via seed)."""
    labels = [
        "low_health_literacy",
        "intermediate_health_literacy",
        "proficient_health_literacy",
    ]
    rng = random.Random(seed)
    buckets = {label: [] for label in labels}
    for item in raw_data:
        label = item.get("label")
        if label not in buckets:
            continue
        example = dspy.Example(
            generated_text=item["diff_label_texts"],
            literacy_label=label,  # Matches the Signature field
        ).with_inputs("generated_text")
        buckets[label].append(example)

    # Truncate every class to the smallest class size to keep splits balanced.
    min_count = min(len(buckets[label]) for label in labels)
    if min_count == 0:
        raise ValueError("One or more labels has no examples; cannot balance.")

    per_label_total = min_count
    per_label_train = int(round(per_label_total * train_ratio))
    # Guarantee at least one train and one test example per label.
    per_label_train = max(1, min(per_label_train, per_label_total - 1))

    trainset = []
    testset = []
    for label in labels:
        rng.shuffle(buckets[label])
        selected = buckets[label][:per_label_total]
        trainset.extend(selected[:per_label_train])
        testset.extend(selected[per_label_train:per_label_total])

    rng.shuffle(trainset)
    rng.shuffle(testset)
    return trainset, testset


# NOTE: 'json' is already imported above; this re-import is redundant but harmless.
import json
path = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
raw_data = json.load(open(path))
trainset, testset = prepare_data(raw_data)

def _example_to_dict(example):
    # Project an Example onto the two persisted fields.
    return {
        "generated_text": example.generated_text,
        "literacy_label": example.literacy_label,
    }

def save_jsonl(path, examples):
    # One JSON object per line (JSONL), UTF-8 preserved.
    with open(path, "w") as f:
        for ex in examples:
            f.write(json.dumps(_example_to_dict(ex), ensure_ascii=False) + "\n")

# Persist the balanced splits for downstream scripts.
train_path = "/home/mshahidul/readctrl/code/text_classifier/train.jsonl"
test_path = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
save_jsonl(train_path, trainset)
save_jsonl(test_path, testset)

def health_literacy_metric(gold, pred, trace=None):
    """Lenient accuracy: the gold label must occur inside the prediction text."""
    if not pred or not hasattr(pred, 'literacy_label'):
        return False

    gold_label = str(gold.literacy_label).strip().lower()
    pred_label = str(pred.literacy_label).strip().lower()

    # Simple inclusion check helps if the LLM gets wordy
    return gold_label in pred_label
124
+
125
# 2. Optimizer: bootstrap few-shot demos with the stronger teacher model,
# then random-search over candidate prompt programs.
optimizer = BootstrapFewShotWithRandomSearch(
    metric=health_literacy_metric,
    max_bootstrapped_demos=3,
    num_candidate_programs=8,
    teacher_settings=dict(lm=openai_model_teacher)
)

# 3. Compile! This creates the "optimized prompt"
compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)

evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)
evaluation_result = evaluator(compiled_classifier)
# Some dspy versions return an object exposing .score, others a bare float.
accuracy_score = (
    float(evaluation_result.score)
    if hasattr(evaluation_result, "score")
    else float(evaluation_result)
)

def _extract_usage(record):
    # Token usage may sit at the top level or nested under 'response'.
    if isinstance(record, dict):
        usage = record.get("usage")
        if usage:
            return usage
        response = record.get("response")
        if isinstance(response, dict) and response.get("usage"):
            return response["usage"]
    return None

def calc_cost_usd(lm, price_in_per_1m, price_out_per_1m, price_cached_in_per_1m=None):
    """Sum token usage over the LM call history and price it (USD per 1M tokens)."""
    prompt_tokens = 0
    completion_tokens = 0
    cached_tokens = 0
    for record in getattr(lm, "history", []) or []:
        usage = _extract_usage(record)
        if not usage:
            continue
        # Providers disagree on field names; accept both spellings.
        prompt_tokens += int(usage.get("prompt_tokens", usage.get("input_tokens", 0)) or 0)
        completion_tokens += int(usage.get("completion_tokens", usage.get("output_tokens", 0)) or 0)
        cached_tokens += int(usage.get("cached_tokens", usage.get("prompt_tokens_cached", 0)) or 0)
    cost = (prompt_tokens / 1_000_000) * price_in_per_1m
    cost += (completion_tokens / 1_000_000) * price_out_per_1m
    if price_cached_in_per_1m is not None:
        cost += (cached_tokens / 1_000_000) * price_cached_in_per_1m
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "cached_tokens": cached_tokens,
        "cost_usd": cost,
    }

# Fill these with current OpenAI pricing (USD per 1M tokens).
GPT5_PRICE_INPUT_PER_1M = 1.25
GPT5_PRICE_OUTPUT_PER_1M = 10.0
GPT5_MINI_PRICE_INPUT_PER_1M = 0.25
GPT5_MINI_PRICE_OUTPUT_PER_1M = 2.0

# Price the teacher (compile-time) and student (runtime) call histories separately.
teacher_cost = calc_cost_usd(
    openai_model_teacher,
    GPT5_PRICE_INPUT_PER_1M,
    GPT5_PRICE_OUTPUT_PER_1M,
)
student_cost = calc_cost_usd(
    openai_model_student,
    GPT5_MINI_PRICE_INPUT_PER_1M,
    GPT5_MINI_PRICE_OUTPUT_PER_1M,
)

cost_report = {
    "gpt-5": teacher_cost,
    "gpt-5-mini": student_cost,
}
# Persist the compiled program plus accuracy/cost reports under a versioned folder.
folder_name="student-gpt5-mini_teacher-gpt5_v1"
os.makedirs(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}", exist_ok=True)
compiled_classifier.save(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/model.json")

print(evaluation_result)
print(json.dumps(cost_report, indent=2))
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/accuracy.json", "w") as f:
    json.dump(
        {
            "accuracy_score": accuracy_score,
            "num_results": len(getattr(evaluation_result, "results", []) or []),
        },
        f,
        indent=2,
    )
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/cost.json", "w") as f:
    json.dump(cost_report, f, indent=2)
code/text_classifier/text_classifier_dspy_vllm.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dspy
2
+ import json
3
+ import os
4
+ import random
5
+ from typing import Literal
6
+ from dspy.teleprompt import BootstrapFewShotWithRandomSearch
7
+ from dspy.evaluate import Evaluate
8
+
9
+ # --- 1. LLM Configuration ---
10
+ api_file = "/home/mshahidul/api_new.json"
11
+ with open(api_file, "r") as f:
12
+ api_keys = json.load(f)
13
+ openai_api_key = api_keys["openai"]
14
+
15
+ # Student: Local vLLM (Deployment Model)
16
+ vllm_model = dspy.LM(
17
+ model="openai/dspy",
18
+ api_base="http://172.16.34.29:8030/v1",
19
+ api_key="EMPTY",
20
+ temperature=0.0
21
+ )
22
+
23
+ # Teacher: OpenAI (High-quality rationale generation)
24
+ # Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')
25
+ openai_model_teacher = dspy.LM(model="gpt-5", api_key=openai_api_key)
26
+
27
+ # Default LM for DSPy runtime
28
+ # Use the local vLLM for fast iteration.
29
+ dspy.configure(lm=vllm_model)
30
+
31
+ class HealthLiteracySignature(dspy.Signature):
32
+ """
33
+ Analyze the linguistic complexity, use of medical jargon, and sentence
34
+ structure of 'generated_text' relative to 'full_text' to determine
35
+ the health literacy level.
36
+ """
37
+ full_text = dspy.InputField(desc="Original clinical or medical source text containing jargon and technical details.")
38
+ generated_text = dspy.InputField(
39
+ desc="A version of the source text rewritten for a specific audience."
40
+ )
41
+
42
+ literacy_label = dspy.OutputField(
43
+ desc="Classification: low_health_literacy (simple words, no jargon), intermediate_health_literacy (moderate technicality), or proficient_health_literacy (highly technical/original level)."
44
+ )
45
+
46
+ class HealthLiteracyClassifier(dspy.Module):
47
+ def __init__(self):
48
+ super().__init__()
49
+ # Use ChainOfThought for better reasoning on medical jargon
50
+ self.classifier = dspy.ChainOfThought(HealthLiteracySignature)
51
+
52
+ def forward(self, full_text, generated_text):
53
+ return self.classifier(full_text=full_text, generated_text=generated_text)
54
+
55
+ def prepare_data(raw_data, seed=42, train_ratio=0.6):
56
+ labels = [
57
+ "low_health_literacy",
58
+ "intermediate_health_literacy",
59
+ "proficient_health_literacy",
60
+ ]
61
+ rng = random.Random(seed)
62
+ buckets = {label: [] for label in labels}
63
+ for item in raw_data:
64
+ label = item.get("label")
65
+ if label not in buckets:
66
+ continue
67
+ example = dspy.Example(
68
+ full_text=item["fulltext"],
69
+ generated_text=item["diff_label_texts"],
70
+ literacy_label=label, # Matches the Signature field
71
+ ).with_inputs("full_text", "generated_text")
72
+ buckets[label].append(example)
73
+
74
+ min_count = min(len(buckets[label]) for label in labels)
75
+ if min_count == 0:
76
+ raise ValueError("One or more labels has no examples; cannot balance.")
77
+
78
+ per_label_total = min_count
79
+ per_label_train = int(round(per_label_total * train_ratio))
80
+ per_label_train = max(1, min(per_label_train, per_label_total - 1))
81
+
82
+ trainset = []
83
+ testset = []
84
+ for label in labels:
85
+ rng.shuffle(buckets[label])
86
+ selected = buckets[label][:per_label_total]
87
+ trainset.extend(selected[:per_label_train])
88
+ testset.extend(selected[per_label_train:per_label_total])
89
+
90
+ rng.shuffle(trainset)
91
+ rng.shuffle(testset)
92
+ return trainset, testset
93
+
94
+
95
+ import json
96
+ path = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
97
+ raw_data = json.load(open(path))
98
+ trainset, testset = prepare_data(raw_data)
99
+
100
+ def _example_to_dict(example):
101
+ return {
102
+ "full_text": example.full_text,
103
+ "generated_text": example.generated_text,
104
+ "literacy_label": example.literacy_label,
105
+ }
106
+
107
+ def save_jsonl(path, examples):
108
+ with open(path, "w") as f:
109
+ for ex in examples:
110
+ f.write(json.dumps(_example_to_dict(ex), ensure_ascii=False) + "\n")
111
+
112
+ train_path = "/home/mshahidul/readctrl/code/text_classifier/train.jsonl"
113
+ test_path = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
114
+ save_jsonl(train_path, trainset)
115
+ save_jsonl(test_path, testset)
116
+
117
+ def health_literacy_metric(gold, pred, trace=None):
118
+ if not pred or not hasattr(pred, 'literacy_label'):
119
+ return False
120
+
121
+ gold_label = str(gold.literacy_label).strip().lower()
122
+ pred_label = str(pred.literacy_label).strip().lower()
123
+
124
+ # Simple inclusion check helps if the LLM gets wordy
125
+ return gold_label in pred_label
126
+
127
+ optimizer = BootstrapFewShotWithRandomSearch(
128
+ metric=health_literacy_metric,
129
+ max_bootstrapped_demos=3,
130
+ num_candidate_programs=8,
131
+ teacher_settings=dict(lm=openai_model_teacher)
132
+ )
133
+
134
+ # 3. Compile! This creates the "optimized prompt"
135
+ compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)
136
+
137
+ evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)
138
+ evaluation_result = evaluator(compiled_classifier)
139
+ accuracy_score = (
140
+ float(evaluation_result.score)
141
+ if hasattr(evaluation_result, "score")
142
+ else float(evaluation_result)
143
+ )
144
+
145
+ def _extract_usage(record):
146
+ if isinstance(record, dict):
147
+ usage = record.get("usage")
148
+ if usage:
149
+ return usage
150
+ response = record.get("response")
151
+ if isinstance(response, dict) and response.get("usage"):
152
+ return response["usage"]
153
+ return None
154
+
155
+ def calc_cost_usd(lm, price_in_per_1m, price_out_per_1m, price_cached_in_per_1m=None):
156
+ prompt_tokens = 0
157
+ completion_tokens = 0
158
+ cached_tokens = 0
159
+ for record in getattr(lm, "history", []) or []:
160
+ usage = _extract_usage(record)
161
+ if not usage:
162
+ continue
163
+ prompt_tokens += int(usage.get("prompt_tokens", usage.get("input_tokens", 0)) or 0)
164
+ completion_tokens += int(usage.get("completion_tokens", usage.get("output_tokens", 0)) or 0)
165
+ cached_tokens += int(usage.get("cached_tokens", usage.get("prompt_tokens_cached", 0)) or 0)
166
+ cost = (prompt_tokens / 1_000_000) * price_in_per_1m
167
+ cost += (completion_tokens / 1_000_000) * price_out_per_1m
168
+ if price_cached_in_per_1m is not None:
169
+ cost += (cached_tokens / 1_000_000) * price_cached_in_per_1m
170
+ return {
171
+ "prompt_tokens": prompt_tokens,
172
+ "completion_tokens": completion_tokens,
173
+ "cached_tokens": cached_tokens,
174
+ "cost_usd": cost,
175
+ }
176
+
177
+ # Fill these with current OpenAI pricing (USD per 1M tokens).
178
+ GPT5_PRICE_INPUT_PER_1M = 1.25
179
+ GPT5_PRICE_OUTPUT_PER_1M = 10.0
180
+
181
+ teacher_cost = calc_cost_usd(
182
+ openai_model_teacher,
183
+ GPT5_PRICE_INPUT_PER_1M,
184
+ GPT5_PRICE_OUTPUT_PER_1M,
185
+ )
186
+
187
+ cost_report = {
188
+ "gpt-5": teacher_cost,
189
+ }
190
+ folder_name = "vllm-qwen3-8b_teacher-gpt5_v1"
191
+ os.makedirs(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}", exist_ok=True)
192
+ compiled_classifier.save(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/model.json")
193
+
194
+ print(evaluation_result)
195
+
196
+ with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/accuracy.json", "w") as f:
197
+ json.dump(
198
+ {
199
+ "accuracy_score": accuracy_score,
200
+ "num_results": len(getattr(evaluation_result, "results", []) or []),
201
+ },
202
+ f,
203
+ indent=2,
204
+ )
205
+ print(json.dumps(cost_report, indent=2))
206
+ with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/cost.json", "w") as f:
207
+ json.dump(cost_report, f, indent=2)
code/text_classifier/text_classifier_dspy_vllm_gen_text_only.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dspy
2
+ import json
3
+ import os
4
+ import random
5
+ from typing import Literal
6
+ from dspy.teleprompt import BootstrapFewShotWithRandomSearch
7
+ from dspy.evaluate import Evaluate
8
+
9
+ # --- 1. LLM Configuration ---
10
+ api_file = "/home/mshahidul/api_new.json"
11
+ with open(api_file, "r") as f:
12
+ api_keys = json.load(f)
13
+ openai_api_key = api_keys["openai"]
14
+
15
+ # Student: Local vLLM (Deployment Model)
16
+ vllm_model = dspy.LM(
17
+ model="openai/dspy",
18
+ api_base="http://172.16.34.21:8040/v1",
19
+ api_key="EMPTY",
20
+ temperature=0.0
21
+ )
22
+ folder_name = "vllm-llama-3.1-8b-awq-int4_teacher-gpt5_v1"
23
+ # Teacher: OpenAI (High-quality rationale generation)
24
+ # Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')
25
+ openai_model_teacher = dspy.LM(model="gpt-5", api_key=openai_api_key)
26
+
27
+ # Default LM for DSPy runtime
28
+ # Use the local vLLM for fast iteration.
29
+ dspy.configure(lm=vllm_model)
30
+
31
+ class HealthLiteracySignature(dspy.Signature):
32
+ """
33
+ Analyze the linguistic complexity, use of medical jargon, and sentence
34
+ structure of 'generated_text' to determine the health literacy level.
35
+ """
36
+ generated_text = dspy.InputField(
37
+ desc="A version of the source text rewritten for a specific audience."
38
+ )
39
+
40
+ literacy_label = dspy.OutputField(
41
+ desc="Classification: low_health_literacy (simple words, no jargon), intermediate_health_literacy (moderate technicality), or proficient_health_literacy (highly technical/original level)."
42
+ )
43
+
44
+ class HealthLiteracyClassifier(dspy.Module):
45
+ def __init__(self):
46
+ super().__init__()
47
+ # Use ChainOfThought for better reasoning on medical jargon
48
+ self.classifier = dspy.ChainOfThought(HealthLiteracySignature)
49
+
50
+ def forward(self, generated_text):
51
+ return self.classifier(generated_text=generated_text)
52
+
53
+ def prepare_data(raw_data, seed=42, train_ratio=0.6):
54
+ labels = [
55
+ "low_health_literacy",
56
+ "intermediate_health_literacy",
57
+ "proficient_health_literacy",
58
+ ]
59
+ rng = random.Random(seed)
60
+ buckets = {label: [] for label in labels}
61
+ for item in raw_data:
62
+ label = item.get("label")
63
+ if label not in buckets:
64
+ continue
65
+ example = dspy.Example(
66
+ generated_text=item["diff_label_texts"],
67
+ literacy_label=label, # Matches the Signature field
68
+ ).with_inputs("generated_text")
69
+ buckets[label].append(example)
70
+
71
+ min_count = min(len(buckets[label]) for label in labels)
72
+ if min_count == 0:
73
+ raise ValueError("One or more labels has no examples; cannot balance.")
74
+
75
+ per_label_total = min_count
76
+ per_label_train = int(round(per_label_total * train_ratio))
77
+ per_label_train = max(1, min(per_label_train, per_label_total - 1))
78
+
79
+ trainset = []
80
+ testset = []
81
+ for label in labels:
82
+ rng.shuffle(buckets[label])
83
+ selected = buckets[label][:per_label_total]
84
+ trainset.extend(selected[:per_label_train])
85
+ testset.extend(selected[per_label_train:per_label_total])
86
+
87
+ rng.shuffle(trainset)
88
+ rng.shuffle(testset)
89
+ return trainset, testset
90
+
91
+
92
+ import json
93
+ path = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
94
+ raw_data = json.load(open(path))
95
+ trainset, testset = prepare_data(raw_data)
96
+
97
+ def _example_to_dict(example):
98
+ return {
99
+ "generated_text": example.generated_text,
100
+ "literacy_label": example.literacy_label,
101
+ }
102
+
103
+ def save_jsonl(path, examples):
104
+ with open(path, "w") as f:
105
+ for ex in examples:
106
+ f.write(json.dumps(_example_to_dict(ex), ensure_ascii=False) + "\n")
107
+
108
+ train_path = "/home/mshahidul/readctrl/code/text_classifier/train.jsonl"
109
+ test_path = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
110
+ save_jsonl(train_path, trainset)
111
+ save_jsonl(test_path, testset)
112
+
113
+ def health_literacy_metric(gold, pred, trace=None):
114
+ if not pred or not hasattr(pred, 'literacy_label'):
115
+ return False
116
+
117
+ gold_label = str(gold.literacy_label).strip().lower()
118
+ pred_label = str(pred.literacy_label).strip().lower()
119
+
120
+ # Simple inclusion check helps if the LLM gets wordy
121
+ return gold_label in pred_label
122
+
123
+ optimizer = BootstrapFewShotWithRandomSearch(
124
+ metric=health_literacy_metric,
125
+ max_bootstrapped_demos=3,
126
+ num_candidate_programs=8,
127
+ teacher_settings=dict(lm=openai_model_teacher)
128
+ )
129
+
130
+ # 3. Compile! This creates the "optimized prompt"
131
+ compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)
132
+
133
+ evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)
134
+ evaluation_result = evaluator(compiled_classifier)
135
+ accuracy_score = (
136
+ float(evaluation_result.score)
137
+ if hasattr(evaluation_result, "score")
138
+ else float(evaluation_result)
139
+ )
140
+
141
+ def _extract_usage(record):
142
+ if isinstance(record, dict):
143
+ usage = record.get("usage")
144
+ if usage:
145
+ return usage
146
+ response = record.get("response")
147
+ if isinstance(response, dict) and response.get("usage"):
148
+ return response["usage"]
149
+ return None
150
+
151
+ def calc_cost_usd(lm, price_in_per_1m, price_out_per_1m, price_cached_in_per_1m=None):
152
+ prompt_tokens = 0
153
+ completion_tokens = 0
154
+ cached_tokens = 0
155
+ for record in getattr(lm, "history", []) or []:
156
+ usage = _extract_usage(record)
157
+ if not usage:
158
+ continue
159
+ prompt_tokens += int(usage.get("prompt_tokens", usage.get("input_tokens", 0)) or 0)
160
+ completion_tokens += int(usage.get("completion_tokens", usage.get("output_tokens", 0)) or 0)
161
+ cached_tokens += int(usage.get("cached_tokens", usage.get("prompt_tokens_cached", 0)) or 0)
162
+ cost = (prompt_tokens / 1_000_000) * price_in_per_1m
163
+ cost += (completion_tokens / 1_000_000) * price_out_per_1m
164
+ if price_cached_in_per_1m is not None:
165
+ cost += (cached_tokens / 1_000_000) * price_cached_in_per_1m
166
+ return {
167
+ "prompt_tokens": prompt_tokens,
168
+ "completion_tokens": completion_tokens,
169
+ "cached_tokens": cached_tokens,
170
+ "cost_usd": cost,
171
+ }
172
+
173
+ # Fill these with current OpenAI pricing (USD per 1M tokens).
174
+ GPT5_PRICE_INPUT_PER_1M = 1.25
175
+ GPT5_PRICE_OUTPUT_PER_1M = 10.0
176
+
177
+ teacher_cost = calc_cost_usd(
178
+ openai_model_teacher,
179
+ GPT5_PRICE_INPUT_PER_1M,
180
+ GPT5_PRICE_OUTPUT_PER_1M,
181
+ )
182
+
183
+ cost_report = {
184
+ "gpt-5": teacher_cost,
185
+ }
186
+
187
+ os.makedirs(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}", exist_ok=True)
188
+ compiled_classifier.save(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/model.json")
189
+
190
+ print(evaluation_result)
191
+
192
+ with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/accuracy.json", "w") as f:
193
+ json.dump(
194
+ {
195
+ "accuracy_score": accuracy_score,
196
+ "num_results": len(getattr(evaluation_result, "results", []) or []),
197
+ },
198
+ f,
199
+ indent=2,
200
+ )
201
+ print(json.dumps(cost_report, indent=2))
202
+ with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/cost.json", "w") as f:
203
+ json.dump(cost_report, f, indent=2)
code/text_classifier/text_classifier_dspy_vllm_test_cpp.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import dspy
5
+ from dspy.evaluate import Evaluate
6
+
7
+
8
+ LLM_CPP_API_BASE = os.environ.get("LLM_CPP_API_BASE", "http://172.16.34.21:8034/v1")
9
+ MODEL_PATH = (
10
+ "/home/mshahidul/readctrl/code/text_classifier/dspy_model/vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1/model.json"
11
+ )
12
+ TEST_PATH = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
13
+
14
+
15
+ llama_cpp_lm = dspy.LM(
16
+ model="openai/dspy",
17
+ api_base=LLM_CPP_API_BASE,
18
+ api_key="EMPTY",
19
+ temperature=0.0,
20
+ )
21
+ dspy.configure(lm=llama_cpp_lm)
22
+
23
+
24
+ class HealthLiteracySignature(dspy.Signature):
25
+ """
26
+ Analyze the linguistic complexity, use of medical jargon, and sentence
27
+ structure of 'generated_text' to determine the health literacy level.
28
+ """
29
+
30
+ generated_text = dspy.InputField(
31
+ desc="A version of the source text rewritten for a specific audience."
32
+ )
33
+ literacy_label = dspy.OutputField(
34
+ desc=(
35
+ "Classification: low_health_literacy (simple words, no jargon), "
36
+ "intermediate_health_literacy (moderate technicality), or "
37
+ "proficient_health_literacy (highly technical/original level)."
38
+ )
39
+ )
40
+
41
+
42
+ class HealthLiteracyClassifier(dspy.Module):
43
+ def __init__(self):
44
+ super().__init__()
45
+ self.classifier = dspy.ChainOfThought(HealthLiteracySignature)
46
+
47
+ def forward(self, generated_text):
48
+ return self.classifier(generated_text=generated_text)
49
+
50
+
51
+ def load_testset(path):
52
+ examples = []
53
+ with open(path, "r") as f:
54
+ for line in f:
55
+ if not line.strip():
56
+ continue
57
+ record = json.loads(line)
58
+ example = dspy.Example(
59
+ generated_text=record["generated_text"],
60
+ literacy_label=record["literacy_label"],
61
+ ).with_inputs("generated_text")
62
+ examples.append(example)
63
+ return examples
64
+
65
+
66
+ def health_literacy_metric(gold, pred, trace=None):
67
+ if not pred or not hasattr(pred, "literacy_label"):
68
+ return False
69
+
70
+ gold_label = str(gold.literacy_label).strip().lower()
71
+ pred_label = str(pred.literacy_label).strip().lower()
72
+ return gold_label in pred_label
73
+
74
+
75
+ def load_compiled_classifier(path):
76
+ if hasattr(dspy, "load"):
77
+ try:
78
+ return dspy.load(path)
79
+ except Exception:
80
+ pass
81
+ classifier = HealthLiteracyClassifier()
82
+ try:
83
+ classifier.load(path)
84
+ except Exception as exc:
85
+ raise RuntimeError(f"Failed to load compiled model from {path}") from exc
86
+ return classifier
87
+
88
+
89
+ def main():
90
+ if not os.path.exists(MODEL_PATH):
91
+ raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")
92
+ if not os.path.exists(TEST_PATH):
93
+ raise FileNotFoundError(f"Test file not found: {TEST_PATH}")
94
+
95
+ testset = load_testset(TEST_PATH)
96
+ compiled_classifier = load_compiled_classifier(MODEL_PATH)
97
+
98
+ evaluator = Evaluate(
99
+ devset=testset,
100
+ metric=health_literacy_metric,
101
+ num_threads=1,
102
+ display_progress=True,
103
+ )
104
+ evaluation_result = evaluator(compiled_classifier)
105
+ accuracy_score = (
106
+ float(evaluation_result.score)
107
+ if hasattr(evaluation_result, "score")
108
+ else float(evaluation_result)
109
+ )
110
+ print(evaluation_result)
111
+ print(f"accuracy_score: {accuracy_score}")
112
+
113
+
114
+ if __name__ == "__main__":
115
+ main()
code/translation_quality_check/calc_comet_bertscore_from_jsonl.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Compute BERTScore and COMET from saved translations.jsonl output.
4
+
5
+ Expected JSONL fields per row:
6
+ - target_language_file
7
+ - direction (e.g., en_to_es)
8
+ - source_text
9
+ - reference_text
10
+ - hypothesis_text
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import csv
17
+ import json
18
+ import os
19
+ from collections import defaultdict
20
+ from pathlib import Path
21
+ from typing import Dict, List, Optional, Tuple
22
+
23
+
24
+ def parse_args() -> argparse.Namespace:
25
+ parser = argparse.ArgumentParser(
26
+ description="Calculate COMET and BERTScore from translations.jsonl"
27
+ )
28
+ parser.add_argument(
29
+ "--input-jsonl",
30
+ default="/home/mshahidul/readctrl/code/translation_quality_check/run_20260214_201430/translations.jsonl",
31
+ help="Path to translations.jsonl",
32
+ )
33
+ parser.add_argument(
34
+ "--output-json",
35
+ default="",
36
+ help="Output JSON path (default: beside input as score_comet_bertscore.json)",
37
+ )
38
+ parser.add_argument(
39
+ "--output-csv",
40
+ default="",
41
+ help="Output CSV path (default: beside input as score_comet_bertscore.csv)",
42
+ )
43
+ parser.add_argument(
44
+ "--summary-csv",
45
+ default="",
46
+ help="Optional summary.csv to update with bertscore_f1 and comet",
47
+ )
48
+ parser.add_argument(
49
+ "--skip-bertscore",
50
+ action="store_true",
51
+ help="Skip BERTScore",
52
+ )
53
+ parser.add_argument(
54
+ "--skip-comet",
55
+ action="store_true",
56
+ help="Skip COMET",
57
+ )
58
+ parser.add_argument(
59
+ "--comet-model",
60
+ default="Unbabel/wmt22-comet-da",
61
+ help="COMET model name for download_model",
62
+ )
63
+ parser.add_argument(
64
+ "--batch-size",
65
+ type=int,
66
+ default=8,
67
+ help="Batch size for COMET prediction",
68
+ )
69
+ return parser.parse_args()
70
+
71
+
72
+ def load_jsonl(path: Path) -> List[dict]:
73
+ rows: List[dict] = []
74
+ with path.open("r", encoding="utf-8") as f:
75
+ for line_no, line in enumerate(f, start=1):
76
+ line = line.strip()
77
+ if not line:
78
+ continue
79
+ try:
80
+ rows.append(json.loads(line))
81
+ except json.JSONDecodeError as exc:
82
+ raise ValueError(f"Invalid JSON at line {line_no} in {path}: {exc}") from exc
83
+ return rows
84
+
85
+
86
+ def direction_target_lang(direction: str) -> str:
87
+ # Expected format: src_to_tgt
88
+ parts = direction.split("_to_")
89
+ if len(parts) != 2:
90
+ return "en"
91
+ return parts[1].strip().lower()
92
+
93
+
94
+ def compute_bertscore(
95
+ hyps: List[str], refs: List[str], target_lang: str
96
+ ) -> Optional[float]:
97
+ try:
98
+ from bert_score import score as bert_score_fn # type: ignore
99
+ except Exception as exc:
100
+ print(
101
+ "[WARN] Could not import bert_score. "
102
+ "Install with: pip install bert-score\n"
103
+ f" Details: {exc}"
104
+ )
105
+ return None
106
+ # BERTScore supports short language codes like en/es/fr/pt.
107
+ _, _, f1 = bert_score_fn(hyps, refs, lang=target_lang, verbose=False)
108
+ return round(float(f1.mean().item()), 6)
109
+
110
+
111
+ def compute_comet(
112
+ srcs: List[str],
113
+ hyps: List[str],
114
+ refs: List[str],
115
+ model_name: str,
116
+ batch_size: int,
117
+ ) -> Optional[float]:
118
+ try:
119
+ from comet import download_model, load_from_checkpoint # type: ignore
120
+ except Exception as exc:
121
+ print(
122
+ "[WARN] Could not import comet. "
123
+ "Install with: pip install unbabel-comet\n"
124
+ f" Details: {exc}"
125
+ )
126
+ return None
127
+
128
+ model_path = download_model(model_name)
129
+ comet_model = load_from_checkpoint(model_path)
130
+ data = [{"src": s, "mt": h, "ref": r} for s, h, r in zip(srcs, hyps, refs)]
131
+ result = comet_model.predict(
132
+ data,
133
+ batch_size=batch_size,
134
+ gpus=1 if os.environ.get("CUDA_VISIBLE_DEVICES") else 0,
135
+ )
136
+ return round(float(result.system_score), 6)
137
+
138
+
139
+ def write_json(path: Path, payload: dict) -> None:
140
+ with path.open("w", encoding="utf-8") as f:
141
+ json.dump(payload, f, ensure_ascii=False, indent=2)
142
+
143
+
144
+ def write_csv(path: Path, rows: List[dict]) -> None:
145
+ cols = [
146
+ "language_file",
147
+ "direction",
148
+ "n_samples",
149
+ "bertscore_f1",
150
+ "comet",
151
+ ]
152
+ with path.open("w", encoding="utf-8", newline="") as f:
153
+ writer = csv.DictWriter(f, fieldnames=cols)
154
+ writer.writeheader()
155
+ writer.writerows(rows)
156
+
157
+
158
+ def maybe_update_summary_csv(summary_path: Path, metrics_rows: List[dict]) -> Path:
159
+ metric_lookup: Dict[Tuple[str, str], dict] = {
160
+ (row["language_file"], row["direction"]): row for row in metrics_rows
161
+ }
162
+ with summary_path.open("r", encoding="utf-8") as f:
163
+ reader = csv.DictReader(f)
164
+ src_rows = list(reader)
165
+ cols = list(reader.fieldnames or [])
166
+
167
+ if "bertscore_f1" not in cols:
168
+ cols.append("bertscore_f1")
169
+ if "comet" not in cols:
170
+ cols.append("comet")
171
+
172
+ out_rows: List[dict] = []
173
+ for row in src_rows:
174
+ key = (row.get("language_file", ""), row.get("direction", ""))
175
+ m = metric_lookup.get(key)
176
+ if m:
177
+ row["bertscore_f1"] = m.get("bertscore_f1", "")
178
+ row["comet"] = m.get("comet", "")
179
+ out_rows.append(row)
180
+
181
+ out_path = summary_path.with_name(f"{summary_path.stem}_with_comet_bertscore.csv")
182
+ with out_path.open("w", encoding="utf-8", newline="") as f:
183
+ writer = csv.DictWriter(f, fieldnames=cols)
184
+ writer.writeheader()
185
+ writer.writerows(out_rows)
186
+ return out_path
187
+
188
+
189
+ def main() -> None:
190
+ args = parse_args()
191
+ input_path = Path(args.input_jsonl)
192
+ if not input_path.exists():
193
+ raise FileNotFoundError(f"Input not found: {input_path}")
194
+
195
+ out_json = (
196
+ Path(args.output_json)
197
+ if args.output_json
198
+ else input_path.with_name("score_comet_bertscore.json")
199
+ )
200
+ out_csv = (
201
+ Path(args.output_csv)
202
+ if args.output_csv
203
+ else input_path.with_name("score_comet_bertscore.csv")
204
+ )
205
+
206
+ rows = load_jsonl(input_path)
207
+ if not args.skip_bertscore:
208
+ print("[info] BERTScore enabled")
209
+ if not args.skip_comet:
210
+ print("[info] COMET enabled")
211
+ groups: Dict[Tuple[str, str], List[dict]] = defaultdict(list)
212
+ for r in rows:
213
+ lang_file = str(r.get("target_language_file", "")).strip()
214
+ direction = str(r.get("direction", "")).strip()
215
+ if not lang_file or not direction:
216
+ continue
217
+ groups[(lang_file, direction)].append(r)
218
+
219
+ score_rows: List[dict] = []
220
+ payload = {
221
+ "input_jsonl": str(input_path),
222
+ "scores": {},
223
+ }
224
+
225
+ for (lang_file, direction), group_rows in sorted(groups.items()):
226
+ srcs = [str(x.get("source_text", "")) for x in group_rows]
227
+ refs = [str(x.get("reference_text", "")) for x in group_rows]
228
+ hyps = [str(x.get("hypothesis_text", "")) for x in group_rows]
229
+
230
+ tgt_lang = direction_target_lang(direction)
231
+ bert = None if args.skip_bertscore else compute_bertscore(hyps, refs, tgt_lang)
232
+ comet = None
233
+ if not args.skip_comet:
234
+ comet = compute_comet(
235
+ srcs=srcs,
236
+ hyps=hyps,
237
+ refs=refs,
238
+ model_name=args.comet_model,
239
+ batch_size=args.batch_size,
240
+ )
241
+
242
+ row = {
243
+ "language_file": lang_file,
244
+ "direction": direction,
245
+ "n_samples": len(group_rows),
246
+ "bertscore_f1": bert if bert is not None else "",
247
+ "comet": comet if comet is not None else "",
248
+ }
249
+ score_rows.append(row)
250
+ payload["scores"].setdefault(lang_file, {})[direction] = {
251
+ "n_samples": len(group_rows),
252
+ "bertscore_f1": bert,
253
+ "comet": comet,
254
+ }
255
+ print(
256
+ f"[done] {lang_file} {direction}: "
257
+ f"bertscore_f1={row['bertscore_f1']} comet={row['comet']}"
258
+ )
259
+
260
+ write_json(out_json, payload)
261
+ write_csv(out_csv, score_rows)
262
+ print(f"\nSaved JSON: {out_json}")
263
+ print(f"Saved CSV: {out_csv}")
264
+
265
+ if args.summary_csv:
266
+ summary_path = Path(args.summary_csv)
267
+ if not summary_path.exists():
268
+ raise FileNotFoundError(f"summary.csv not found: {summary_path}")
269
+ merged_path = maybe_update_summary_csv(summary_path, score_rows)
270
+ print(f"Saved merged summary: {merged_path}")
271
+
272
+
273
+ if __name__ == "__main__":
274
+ main()
code/translation_quality_check/eval_gpt52_translation.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Evaluate GPT-5.2 translation quality on MultiClinSum files.
4
+
5
+ What this script does:
6
+ 1) Loads EN/ES/FR/PT json files (expects fields like id/fulltext/summary)
7
+ 2) Aligns EN with each non-EN language by shared numeric case id
8
+ 3) Samples N aligned instances per language pair
9
+ 4) Runs bidirectional translation with GPT-5.2:
10
+ - EN -> X
11
+ - X -> EN
12
+ 5) Reports common MT metrics used in top venues:
13
+ - BLEU (sacreBLEU)
14
+ - chrF++ (sacreBLEU chrF)
15
+ - COMET (if installed)
16
+ - BERTScore F1 (if installed)
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import csv
23
+ import json
24
+ import os
25
+ import random
26
+ import re
27
+ import sys
28
+ import time
29
+ from dataclasses import dataclass
30
+ from datetime import datetime
31
+ from pathlib import Path
32
+ from typing import Dict, List, Optional
33
+
34
+ from openai import OpenAI
35
+ import sacrebleu
36
+
37
+
38
+ ID_NUM_RE = re.compile(r"_(\d+)\.txt$")
39
+
40
+
41
+ @dataclass
42
+ class Example:
43
+ case_id: str
44
+ text: str
45
+ raw_id: str
46
+
47
+
48
def parse_args() -> argparse.Namespace:
    """Define and parse the CLI options for one translation-evaluation run."""
    parser = argparse.ArgumentParser(description="GPT-5.2 translation evaluation")
    # --- input data files (one JSON list per language) ---
    parser.add_argument(
        "--en-file",
        default="/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json",
        help="Path to English json file",
    )
    parser.add_argument(
        "--es-file",
        default="/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_es.json",
        help="Path to Spanish json file",
    )
    parser.add_argument(
        "--fr-file",
        default="/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_fr.json",
        help="Path to French json file",
    )
    parser.add_argument(
        "--pt-file",
        default="/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_pt.json",
        help="Path to Portuguese json file",
    )
    # --- sampling / cost controls ---
    parser.add_argument(
        "--num-samples",
        type=int,
        default=20,
        help="Samples per language pair",
    )
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument(
        "--model",
        default="gpt-5.2",
        help="OpenAI model name",
    )
    parser.add_argument(
        "--max-chars",
        type=int,
        default=2500,
        help="Character cap per sample to control cost/latency",
    )
    parser.add_argument(
        "--api-file",
        default="/home/mshahidul/api_new.json",
        help="JSON file containing API keys (expects key 'openai')",
    )
    # --- output / metric toggles ---
    parser.add_argument(
        "--output-dir",
        default="/home/mshahidul/readctrl/code/translation_quality_check",
        help="Directory to save outputs",
    )
    parser.add_argument(
        "--skip-comet",
        action="store_true",
        help="Skip COMET even if installed",
    )
    parser.add_argument(
        "--skip-bertscore",
        action="store_true",
        help="Skip BERTScore even if installed",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.0,
        help="Decoding temperature",
    )
    parser.add_argument(
        "--save-every",
        type=int,
        default=10,
        help="Checkpoint save interval (in translated instances)",
    )
    return parser.parse_args()
121
+
122
+
123
def load_json(path: str) -> List[dict]:
    """Parse a UTF-8 JSON file and return its top-level list of row dicts."""
    return json.loads(Path(path).read_text(encoding="utf-8"))
126
+
127
+
128
def normalize_case_id(raw_id: str) -> str:
    """Extract the numeric case id from ids like 'xxx_123.txt'.

    Falls back to returning ``raw_id`` unchanged when the pattern
    (same pattern as the module-level ``ID_NUM_RE``) does not match.
    """
    match = re.search(r"_(\d+)\.txt$", raw_id)
    return match.group(1) if match else raw_id
133
+
134
+
135
def dataset_to_examples(rows: List[dict], field: str) -> Dict[str, Example]:
    """Index rows by normalized case id, keeping only non-empty texts.

    When ``field`` is absent from a row, falls back to 'summary' then
    'fulltext'.  If several rows normalize to the same case id, the last
    one wins.
    """
    examples: Dict[str, Example] = {}
    for row in rows:
        raw_id = str(row.get("id", ""))
        value = row.get(field)
        if value is None:
            value = row.get("summary") or row.get("fulltext") or ""
        cleaned = str(value).strip()
        if not cleaned:
            continue  # skip rows with no usable text
        case_id = normalize_case_id(raw_id)
        examples[case_id] = Example(case_id=case_id, text=cleaned, raw_id=raw_id)
    return examples
148
+
149
+
150
def truncate_text(text: str, max_chars: int) -> str:
    """Cap ``text`` at ``max_chars`` characters, marking cuts with ' ...'.

    A non-positive ``max_chars`` disables truncation entirely.
    """
    if max_chars <= 0 or len(text) <= max_chars:
        return text
    return text[:max_chars].rstrip() + " ..."
156
+
157
+
158
def translate_one(
    client: OpenAI,
    model: str,
    text: str,
    src_lang_name: str,
    tgt_lang_name: str,
    temperature: float,
) -> str:
    """Translate ``text`` from ``src_lang_name`` to ``tgt_lang_name``.

    Uses the OpenAI Responses API with a fidelity-first clinical prompt and
    returns only the translated text (whitespace-stripped).
    """
    system = (
        "You are a professional medical translator for clinical text. "
        "Your top priority is fidelity and patient-safety: do not hallucinate, "
        "do not add, remove, infer, or normalize medical content that is not explicitly present. "
        "Preserve the original meaning, uncertainty, negation, severity, temporality, "
        "numbers, units, dosages, lab values, abbreviations, named entities, and terminology. "
        "If a term is ambiguous, keep the closest literal translation rather than guessing. "
        "Keep formatting and sentence boundaries as close as possible to the source. "
        "Return only the translated text, with no explanation."
    )
    user = (
        f"Translate the following medical text from {src_lang_name} to {tgt_lang_name}.\n"
        "Strict rules: no extra information, no paraphrased additions, no clinical assumptions.\n\n"
        f"{text}"
    )
    # BUG FIX: ``temperature`` was accepted but never forwarded to the API,
    # so the --temperature CLI flag had no effect. It is now passed through.
    # NOTE(review): some reasoning-class models reject the temperature
    # parameter — confirm the configured model accepts it.
    response = client.responses.create(
        model=model,
        temperature=temperature,
        input=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    )
    return response.output_text.strip()
189
+
190
+
191
def compute_bleu_chrf(hypotheses: List[str], references: List[str]) -> Dict[str, float]:
    """Corpus-level BLEU and chrF++ via sacreBLEU (single reference set)."""
    bleu = sacrebleu.corpus_bleu(hypotheses, [references]).score
    # BUG FIX: corpus_chrf defaults to word_order=0, which is plain chrF.
    # The score is reported as "chrf++", which requires word bigrams
    # (word_order=2) per Popović (2017) / sacreBLEU's chrF2++ signature.
    chrf = sacrebleu.corpus_chrf(hypotheses, [references], word_order=2).score
    return {"bleu": round(bleu, 4), "chrf++": round(chrf, 4)}
195
+
196
+
197
def maybe_compute_bertscore(
    hypotheses: List[str],
    references: List[str],
    target_lang: str,
) -> Optional[float]:
    """Return mean BERTScore F1, or None when bert-score is not installed."""
    try:
        from bert_score import score as bert_score_fn  # type: ignore
    except Exception:
        # Optional dependency: silently skip the metric when unavailable.
        return None
    scores = bert_score_fn(hypotheses, references, lang=target_lang, verbose=False)
    f1 = scores[2]
    return round(float(f1.mean().item()), 6)
208
+
209
+
210
def maybe_compute_comet(
    sources: List[str],
    hypotheses: List[str],
    references: List[str],
) -> Optional[float]:
    """Return the COMET (wmt22-comet-da) system score, or None if COMET is absent."""
    try:
        from comet import download_model, load_from_checkpoint  # type: ignore
    except Exception:
        # Optional dependency: silently skip the metric when unavailable.
        return None
    checkpoint = download_model("Unbabel/wmt22-comet-da")
    scorer = load_from_checkpoint(checkpoint)
    batch = [
        {"src": src, "mt": hyp, "ref": ref}
        for src, hyp, ref in zip(sources, hypotheses, references)
    ]
    # GPU is used only when CUDA_VISIBLE_DEVICES is set in the environment.
    use_gpus = 1 if os.environ.get("CUDA_VISIBLE_DEVICES") else 0
    prediction = scorer.predict(batch, batch_size=8, gpus=use_gpus)
    return round(float(prediction.system_score), 6)
224
+
225
+
226
def ensure_dir(path: str) -> None:
    """Create ``path`` (including parents) if it does not already exist."""
    os.makedirs(path, exist_ok=True)
228
+
229
+
230
def persist_outputs(
    json_path: Path,
    details_path: Path,
    csv_path: Path,
    all_results: dict,
    detailed_rows: List[dict],
    summary_rows: List[dict],
) -> None:
    """Write the aggregate scores JSON, per-sample JSONL, and summary CSV.

    All three files are rewritten from scratch on every call, so repeated
    checkpoint invocations are safe.
    """
    json_path.write_text(
        json.dumps(all_results, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # One JSON object per line (JSONL) for the detailed translations.
    lines = [json.dumps(row, ensure_ascii=False) for row in detailed_rows]
    details_path.write_text(
        "".join(line + "\n" for line in lines),
        encoding="utf-8",
    )

    fieldnames = [
        "language_file",
        "direction",
        "n_samples",
        "bleu",
        "chrf++",
        "bertscore_f1",
        "comet",
        "elapsed_sec",
    ]
    # newline="" per the csv module docs to avoid blank lines on Windows.
    with open(csv_path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for row in summary_rows:
            writer.writerow(row)
260
+
261
+
262
def resolve_openai_api_key(api_file: str) -> str:
    """Read the OpenAI API key (field 'openai') from a JSON credentials file."""
    # Same credential-loading convention as diff_label_text_creation_bangla.py.
    credentials = json.loads(Path(api_file).read_text(encoding="utf-8"))
    return str(credentials["openai"])
267
+
268
+
269
def main() -> None:
    """Run bidirectional EN<->{ES,FR,PT} translation and score the outputs.

    For each non-English language: align cases with English by numeric id,
    sample up to --num-samples aligned cases, translate both directions with
    the configured model, then report BLEU/chrF (always) plus BERTScore and
    COMET (when installed and not skipped). All artifacts are written to a
    timestamped run directory, with periodic checkpoints.
    """
    args = parse_args()
    api_key = resolve_openai_api_key(args.api_file)

    # Seeded RNG keeps the per-pair sample selection reproducible.
    rng = random.Random(args.seed)
    client = OpenAI(api_key=api_key)

    en_rows = load_json(args.en_file)
    lang_files = {"es": args.es_file, "fr": args.fr_file, "pt": args.pt_file}

    # Evaluation uses the full clinical text; dataset_to_examples falls back
    # to 'summary'/'fulltext' when the field is missing from a row.
    field = "fulltext"
    en_map = dataset_to_examples(en_rows, field)
    lang_maps = {
        lang: dataset_to_examples(load_json(path), field)
        for lang, path in lang_files.items()
    }

    lang_name = {"en": "English", "es": "Spanish", "fr": "French", "pt": "Portuguese"}
    bert_lang = {"en": "en", "es": "es", "fr": "fr", "pt": "pt"}

    # Each run writes into its own timestamped directory.
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
    # datetime.now(timezone.utc) is the modern equivalent.
    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    run_dir = Path(args.output_dir) / f"run_{timestamp}"
    ensure_dir(str(run_dir))

    # Aggregate results document: settings snapshot plus per-pair scores.
    all_results = {
        "run_time_utc": datetime.utcnow().isoformat(),
        "settings": {
            "model": args.model,
            "field": field,
            "num_samples": args.num_samples,
            "max_chars": args.max_chars,
            "seed": args.seed,
            "files": {
                "en": args.en_file,
                "es": args.es_file,
                "fr": args.fr_file,
                "pt": args.pt_file,
            },
        },
        "scores": {},
    }

    detailed_rows: List[dict] = []
    summary_rows: List[dict] = []
    all_results["partial_scores"] = {}

    json_path = run_dir / "scores.json"
    details_path = run_dir / "translations.jsonl"
    csv_path = run_dir / "summary.csv"

    for tgt_lang, tgt_map in lang_maps.items():
        # Align EN and target-language examples on the shared numeric case id.
        common_ids = sorted(set(en_map.keys()) & set(tgt_map.keys()))
        if not common_ids:
            print(f"[WARN] No aligned IDs between en and {tgt_lang}. Skipping.")
            continue
        k = min(args.num_samples, len(common_ids))
        sampled_ids = rng.sample(common_ids, k=k)

        pair_results = {}
        print(f"[INFO] Evaluating EN <-> {tgt_lang.upper()} with {k} samples")

        # Both directions reuse the same sampled case ids.
        directions = [("en", tgt_lang), (tgt_lang, "en")]
        for src_lang, out_lang in directions:
            sources: List[str] = []
            refs: List[str] = []
            hyps: List[str] = []

            start = time.time()
            for idx, case_id in enumerate(sampled_ids, start=1):
                # Pick source/reference sides according to direction.
                src_ex = en_map[case_id] if src_lang == "en" else tgt_map[case_id]
                ref_ex = tgt_map[case_id] if out_lang == tgt_lang else en_map[case_id]

                # Truncate both sides identically to bound cost/latency.
                src_text = truncate_text(src_ex.text, args.max_chars)
                ref_text = truncate_text(ref_ex.text, args.max_chars)

                hyp = translate_one(
                    client=client,
                    model=args.model,
                    text=src_text,
                    src_lang_name=lang_name[src_lang],
                    tgt_lang_name=lang_name[out_lang],
                    temperature=args.temperature,
                )

                sources.append(src_text)
                refs.append(ref_text)
                hyps.append(hyp)

                # Full per-sample record for later inspection / re-scoring.
                detailed_rows.append(
                    {
                        "target_language_file": tgt_lang,
                        "direction": f"{src_lang}_to_{out_lang}",
                        "case_id": case_id,
                        "src_raw_id": src_ex.raw_id,
                        "ref_raw_id": ref_ex.raw_id,
                        "source_text": src_text,
                        "reference_text": ref_text,
                        "hypothesis_text": hyp,
                    }
                )
                print(
                    f" [{src_lang}->{out_lang}] {idx}/{k} done "
                    f"(case_id={case_id})"
                )

                # Periodic checkpoint: interim BLEU/chrF over what is done so
                # far, plus a flush of all artifacts to disk.
                if args.save_every > 0 and (idx % args.save_every == 0):
                    partial_key = f"{tgt_lang}:{src_lang}_to_{out_lang}"
                    all_results["partial_scores"][partial_key] = {
                        "completed": idx,
                        "total": k,
                        **compute_bleu_chrf(hyps, refs),
                    }
                    persist_outputs(
                        json_path=json_path,
                        details_path=details_path,
                        csv_path=csv_path,
                        all_results=all_results,
                        detailed_rows=detailed_rows,
                        summary_rows=summary_rows,
                    )
                    print(
                        f" [checkpoint] saved at {idx}/{k} "
                        f"for {src_lang}->{out_lang}"
                    )

            # Final metrics for this direction; optional metrics stay None
            # when their packages are missing.
            metric_dict = compute_bleu_chrf(hyps, refs)
            if not args.skip_bertscore:
                bs = maybe_compute_bertscore(hyps, refs, bert_lang[out_lang])
                metric_dict["bertscore_f1"] = bs if bs is not None else None
            if not args.skip_comet:
                comet = maybe_compute_comet(sources, hyps, refs)
                metric_dict["comet"] = comet if comet is not None else None

            metric_dict["n_samples"] = k
            metric_dict["elapsed_sec"] = round(time.time() - start, 2)
            key = f"{src_lang}_to_{out_lang}"
            pair_results[key] = metric_dict

            summary_rows.append(
                {
                    "language_file": tgt_lang,
                    "direction": key,
                    **metric_dict,
                }
            )

        all_results["scores"][tgt_lang] = pair_results

    # Final write of all artifacts after every pair is finished.
    persist_outputs(
        json_path=json_path,
        details_path=details_path,
        csv_path=csv_path,
        all_results=all_results,
        detailed_rows=detailed_rows,
        summary_rows=summary_rows,
    )

    print("\n=== Translation Evaluation Complete ===")
    print(f"Run directory: {run_dir}")
    print(f"Scores JSON: {json_path}")
    print(f"Summary CSV: {csv_path}")
    print(f"Details JSONL: {details_path}")
431
+
432
+
433
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # 130 is the conventional exit status for SIGINT (128 + signal 2).
        print("\nInterrupted by user.")
        sys.exit(130)
code/validation/data_gen_subclaims_support_valid_ch_gpt5.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from openai import OpenAI
import json, os

# Prompt template containing <<<DOCUMENT>>> / <<<SUBCLAIMS>>> placeholders.
with open("/home/mshahidul/readctrl/prompts/subclaim_result_generate_gpt5.txt", "r") as f:
    prompt_template = f.read()


# Load the OpenAI API key from the shared credentials file.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]

client = OpenAI(api_key=openai_api_key)
14
+
15
+
16
def openai_return(prompt, model="gpt-5"):
    """Send a prompt to GPT and parse JSON."""
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    raw = completion.choices[0].message.content.strip()
    # Strip markdown code fences the model sometimes wraps around JSON.
    cleaned = raw.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Best-effort fallback: keep the raw string instead of failing the run.
        print("⚠️ JSON parse failed — storing raw text.")
        return cleaned
32
+
33
with open("/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_full_data.json", "r") as f:
    data = json.load(f)

save_path="/home/mshahidul/readctrl/data/model_validity_check/subclaims_support_validity_check_gt_gpt5(1-5).json"
res=[]
# Resume from an existing checkpoint so a restarted run keeps prior results.
# NOTE(review): the loop below still restarts at i=0, so resuming can append
# duplicate entries for documents already processed — confirm intent.
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        res = json.load(f)
import tqdm
# Validate the first 5 documents (matches the "(1-5)" in save_path) across
# the three readability levels.
for i in tqdm.tqdm(range(5)):
    for label in ["easy", "intermediate", "hard"]:
        # Fill the prompt template with the full document and that level's subclaims.
        new_prompt = prompt_template.replace("<<<DOCUMENT>>>",data[i]['fulltext']).replace("<<<SUBCLAIMS>>>", json.dumps(data[i][f'{label}_subclaims'], indent=2, ensure_ascii=False))
        # import ipdb; ipdb.set_trace()
        sample = openai_return(new_prompt, model="gpt-5")

        res.append(sample)
        # Checkpoint every second result so progress survives crashes.
        if len(res) % 2 == 0:
            with open(save_path, "w") as f:
                json.dump(res, f, indent=2, ensure_ascii=False)
            print(f"Saved {len(res)} samples so far.")

# Final write with whatever accumulated, checkpointed or not.
with open(save_path, "w") as f:
    json.dump(res, f, indent=2, ensure_ascii=False)
+
code/validation/subclaims_extr_valid_check_gpt5.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from openai import OpenAI
import json, os

# Prompt template containing <<<TEXT>>> / <<<SUBCLAIMS>>> placeholders.
with open("/home/mshahidul/readctrl/prompts/subclaims_extraction_vali.txt", "r") as f:
    prompt_template = f.read()


# Load the OpenAI API key from the shared credentials file.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]

client = OpenAI(api_key=openai_api_key)
14
+
15
+
16
def openai_return(prompt, model="gpt-5"):
    """Send a prompt to GPT and parse JSON."""
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    raw = completion.choices[0].message.content.strip()
    # Strip markdown code fences the model sometimes wraps around JSON.
    cleaned = raw.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Best-effort fallback: keep the raw string instead of failing the run.
        print("⚠️ JSON parse failed — storing raw text.")
        return cleaned
32
+
33
with open("/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_full_data.json", "r") as f:
    data = json.load(f)

save_path="/home/mshahidul/readctrl/data/model_validity_check/subclaims_validity_check_v1.json"
res=[]
# Resume from an existing checkpoint so a restarted run keeps prior results.
# NOTE(review): the loop below still restarts at i=0, so resuming can append
# duplicate entries for documents already processed — confirm intent.
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        res = json.load(f)
import tqdm
# Validate the first 5 documents across the three readability levels.
for i in tqdm.tqdm(range(5)):
    for label in ["easy", "intermediate", "hard"]:
        # Fill the prompt template with the level-specific text and its subclaims.
        new_prompt = prompt_template.replace("<<<TEXT>>>",data[i][f"{label}_text"]).replace("<<<SUBCLAIMS>>>", json.dumps(data[i][f"{label}_subclaims"], indent=2, ensure_ascii=False))
        # import ipdb; ipdb.set_trace()
        sample = openai_return(new_prompt, model="gpt-5")

        res.append(sample)
        # Checkpoint every second result so progress survives crashes.
        if len(res) % 2 == 0:
            with open(save_path, "w") as f:
                json.dump(res, f, indent=2, ensure_ascii=False)
            print(f"Saved {len(res)} samples so far.")

# Final write with whatever accumulated, checkpointed or not.
with open(save_path, "w") as f:
    json.dump(res, f, indent=2, ensure_ascii=False)
56
+