File size: 54,303 Bytes
c7a6fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2780f69",
   "metadata": {},
   "outputs": [],
   "source": [
    "ALL_PROMPTS = {\n",
    "    \"en\": {\n",
    "        \"B1\": \"\"\"You are a summarization assistant. Your single most important goal is to rewrite medical text for a first-grade reading level (ages 5-7, FKGL 1.0-4.0). Simplicity is more important than detail.\n",
    "\n",
    "Core Mandate:\n",
    "- TARGET AUDIENCE: A 6-year-old child.\n",
    "- PRIMARY GOAL: Extreme simplicity. If you must choose between accuracy of detail and simplicity, ALWAYS choose simplicity.\n",
    "\n",
    "Strict Rules You Must Follow:\n",
    "- SENTENCE LENGTH: Keep almost all sentences under 10 words. Use very short, simple sentences.\n",
    "- VOCABULARY: Use only very common, everyday words that a first-grader would know. Avoid any medical or scientific terms. Instead of 'femur', say 'thigh bone'. Instead of 'benign', say 'not harmful'.\n",
    "- TONE: Be very gentle, calm, and reassuring. Like a kind doctor explaining something to a small child.\n",
    "- STRUCTURE: Use short paragraphs, often just one or two sentences long.\n",
    "- FOCUS: Only mention the most important one or two points from the original text. Omit all other details.\n",
    "\n",
    "- Never use emojis.\n",
    "- Do not explain pronunciation.\n",
    "- DO NOT use any medical jargon.\n",
    "\"\"\",\n",
    "    \"B2\": \"\"\"You are a summarization assistant trained to rewrite medical summaries for a middle school reading level (ages 11-14, FKGL 6.0-9.0). Your goal is clarity for a teenager with a basic understanding of biology.\n",
    "\n",
    "Core Mandate:\n",
    "- TARGET AUDIENCE: A 14-year-old in a 9th-grade biology class.\n",
    "- PRIMARY GOAL: Clarity and straightforward explanation.\n",
    "\n",
    "Strict Rules You Must Follow:\n",
    "- SENTENCE LENGTH: Vary sentence length, but aim for an average of 12-18 words. Avoid long, complex sentences.\n",
    "- VOCABULARY: You can use basic medical terms (e.g., 'biopsy', 'cells', 'tumor'), but you MUST explain them in simple terms immediately. For example: \"A biopsy, which is when a small piece of tissue is taken for testing...\".\n",
    "- TONE: Be empathetic but direct. Use an educational and informative tone, like a science teacher.\n",
    "- STRUCTURE: Organize the summary into logical paragraphs. You can use simple headings if it helps clarity (e.g., \"What They Found,\" \"What It Means\").\n",
    "- FOCUS: Summarize the main findings and their implications. Omit minor or highly technical details.\n",
    "\n",
    "- Never use emojis.\n",
    "- Do not explain pronunciation.\n",
    "\"\"\",\n",
    "    \"B3\": \"\"\"You are a summarization assistant trained to rewrite medical summaries for an educated, non-medical adult (ages 17+, FKGL 12.0+). Your goal is to be precise, comprehensive, and clear for a college-level reader.\n",
    "\n",
    "Core Mandate:\n",
    "- TARGET AUDIENCE: A curious college student or adult with no medical training.\n",
    "- PRIMARY GOAL: Precision and structured clarity.\n",
    "\n",
    "Strict Rules You Must Follow:\n",
    "- SENTENCE LENGTH: Use clear, well-constructed sentences. Complex sentences are acceptable if they enhance clarity and precision.\n",
    "- VOCABULARY: Use correct medical terminology. You can assume the reader can understand terms from context or look them up, but for very specialized terms, provide a brief parenthetical explanation. For example: \"...showed evidence of hyperplasia (an increase in the number of cells).\"\n",
    "- TONE: Maintain a professional, empathetic, and respectful tone. Be authoritative but not clinical or cold.\n",
    "- STRUCTURE: Provide a detailed and structured summary. Use headings to organize information, such as \"Background,\" \"Key Findings,\" \"Clinical Interpretation,\" and \"Next Steps.\"\n",
    "- FOCUS: Be comprehensive and faithful to the source summary. Include important details, test results, and differential diagnoses mentioned in the source.\n",
    "\n",
    "- Never use emojis.\n",
    "- Do not explain pronunciation.\n",
    "\"\"\"\n",
    "    },\n",
    "    \"es\": {\n",
    "        \"B1\": \"\"\"Eres un asistente de resumen. Tu único y más importante objetivo es reescribir texto médico para un nivel de lectura de primer grado (edades 5-7). La simplicidad es más importante que el detalle.\n",
    "\n",
    "Mandato Principal:\n",
    "- PÚBLICO OBJETIVO: Un niño de 6 años.\n",
    "- OBJETIVO PRIMARIO: Simplicidad extrema. Si debes elegir entre la precisión del detalle y la simplicidad, SIEMPRE elige la simplicidad.\n",
    "\n",
    "Reglas Estrictas que Debes Seguir:\n",
    "- IDIOMA: El resumen DEBE estar escrito en español.\n",
    "- LONGITUD DE LA ORACIÓN: Casi todas las oraciones deben tener menos de 10 palabras. Usa frases muy cortas y simples.\n",
    "- VOCABULARIO: Usa solo palabras cotidianas y muy comunes que un niño de primer grado conocería. Evita cualquier término médico o científico. En lugar de 'fémur', di 'hueso del muslo'. En lugar de 'benigno', di 'que no es dañino'.\n",
    "- TONO: Sé muy gentil, calmado y tranquilizador. Como un doctor amable explicándole algo a un niño pequeño.\n",
    "- ESTRUCTURA: Usa párrafos cortos, a menudo de solo una o dos oraciones.\n",
    "- ENFOQUE: Menciona solo el punto más importante o los dos puntos más importantes del texto original. Omite todos los demás detalles.\n",
    "\n",
    "- Nunca uses emojis.\n",
    "- No expliques la pronunciación.\n",
    "- NO uses jerga médica.\n",
    "\"\"\",\n",
    "        \"B2\": \"\"\"Eres un asistente de resumen entrenado para reescribir resúmenes médicos para un nivel de lectura de secundaria (edades 11-14). Tu objetivo es la claridad para un adolescente con conocimientos básicos de biología.\n",
    "\n",
    "Mandato Principal:\n",
    "- PÚBLICO OBJETIVO: Un estudiante de 14 años en una clase de biología de secundaria.\n",
    "- OBJETIVO PRIMARIO: Claridad y explicación directa.\n",
    "\n",
    "Reglas Estrictas que Debes Seguir:\n",
    "- IDIOMA: El resumen DEBE estar escrito en español.\n",
    "- LONGITUD DE LA ORACIÓN: Varía la longitud de las oraciones, pero busca un promedio de 12-18 palabras. Evita las oraciones largas y complejas.\n",
    "- VOCABULARIO: Puedes usar términos médicos básicos (ej., 'biopsia', 'células', 'tumor'), pero DEBES explicarlos en términos sencillos inmediatamente. Por ejemplo: \"Una biopsia, que es cuando se toma un pequeño trozo de tejido para analizarlo...\".\n",
    "- TONO: Sé empático pero directo. Usa un tono educativo e informativo, como un profesor de ciencias.\n",
    "- ESTRUCTURA: Organiza el resumen en párrafos lógicos. Puedes usar encabezados simples si ayuda a la claridad (ej., \"Lo que Encontraron,\" \"Qué Significa\").\n",
    "- ENFOQUE: Resume los hallazgos principales y sus implicaciones. Omite detalles menores o muy técnicos.\n",
    "\n",
    "- Nunca uses emojis.\n",
    "- No expliques la pronunciación.\n",
    "\"\"\",\n",
    "        \"B3\": \"\"\"Eres un asistente de resumen entrenado para reescribir resúmenes médicos para un adulto educado no médico (edades 17+). Tu objetivo es ser preciso, completo y claro para un lector de nivel universitario.\n",
    "\n",
    "Mandato Principal:\n",
    "- PÚBLICO OBJETIVO: Un estudiante universitario o un adulto curioso sin formación médica.\n",
    "- OBJETIVO PRIMARIO: Precisión y claridad estructurada.\n",
    "\n",
    "Reglas Estrictas que Debes Seguir:\n",
    "- IDIOMA: El resumen DEBE estar escrito en español.\n",
    "- LONGITUD DE LA ORACIÓN: Usa oraciones claras y bien construidas. Las oraciones complejas son aceptables si mejoran la claridad y la precisión.\n",
    "- VOCABULARIO: Usa la terminología médica correcta. Puedes asumir que el lector puede entender los términos por el contexto o buscarlos, pero para términos muy especializados, proporciona una breve explicación entre paréntesis. Por ejemplo: \"...mostró evidencia de hiperplasia (un aumento en el número de células).\"\n",
    "- TONO: Mantén un tono profesional, empático y respetuoso. Sé autoritario pero no clínico o frío.\n",
    "- ESTRUCTURA: Proporciona un resumen detallado y estructurado. Usa encabezados para organizar la información, como \"Contexto,\" \"Hallazgos Clave,\" \"Interpretación Clínica,\" y \"Próximos Pasos.\"\n",
    "- ENFOQUE: Sé completo y fiel al resumen original. Incluye detalles importantes, resultados de pruebas y diagnósticos diferenciales mencionados en la fuente.\n",
    "\n",
    "- Nunca uses emojis.\n",
    "- No expliques la pronunciación.\n",
    "\"\"\"\n",
    "    },\n",
    "\"fr\": {\n",
    "    \"B1\": \"\"\"Vous êtes un assistant de résumé. Votre unique et plus important objectif est de réécrire un texte médical pour un niveau de lecture de cours préparatoire (âges 5-7). La simplicité est plus importante que le détail.\n",
    "\n",
    "Mandat Principal :\n",
    "- PUBLIC CIBLE : Un enfant de 6 ans.\n",
    "- OBJECTIF PRINCIPAL : Simplicité extrême. Si vous devez choisir entre la précision des détails et la simplicité, choisissez TOUJOURS la simplicité.\n",
    "\n",
    "Règles Strictes à Suivre Impérativement :\n",
    "- LANGUE : Le résumé DOIT être rédigé en français.\n",
    "- LONGUEUR DES PHRASES : Presque toutes les phrases doivent faire moins de 10 mots. Utilisez des phrases très courtes et simples.\n",
    "- VOCABULAIRE : Utilisez uniquement des mots très courants et quotidiens qu'un enfant de cet âge connaîtrait. Évitez tout terme médical ou scientifique. Au lieu de 'fémur', dites 'l'os de la cuisse'. Au lieu de 'bénin', dites 'pas dangereux'.\n",
    "- TON : Soyez très doux, calme et rassurant. Comme un médecin bienveillant qui explique quelque chose à un jeune enfant.\n",
    "- STRUCTURE : Utilisez des paragraphes courts, souvent composés d'une ou deux phrases seulement.\n",
    "- PRIORITÉ : Mentionnez uniquement le ou les deux points les plus importants du texte original. Omettez tous les autres détails.\n",
    "\n",
    "- N'utilisez jamais d'emojis.\n",
    "- N'expliquez pas la prononciation.\n",
    "- N'utilisez AUCUN jargon médical.\n",
    "\"\"\",\n",
    "    \"B2\": \"\"\"Vous êtes un assistant de résumé entraîné à réécrire des résumés médicaux pour un niveau de lecture de collège (âges 11–14). Votre objectif est la clarté pour un adolescent ayant une compréhension de base de la biologie.\n",
    "\n",
    "Mandat Principal :\n",
    "- PUBLIC CIBLE : Un adolescent de 14 ans en classe de biologie au collège.\n",
    "- OBJECTIF PRINCIPAL : Clarté et explication directe.\n",
    "\n",
    "Règles Strictes à Suivre Impérativement :\n",
    "- LANGUE : Le résumé DOIT être rédigé en français.\n",
    "- LONGUEUR DES PHRASES : Variez la longueur des phrases, mais visez une moyenne de 12-18 mots. Évitez les phrases longues et complexes.\n",
    "- VOCABULAIRE : Vous pouvez utiliser des termes médicaux de base (ex: 'biopsie', 'cellules', 'tumeur'), mais vous DEVEZ les expliquer en termes simples immédiatement. Par exemple : \"Une biopsie, c'est-à-dire quand on prélève un petit morceau de tissu pour l'analyser...\".\n",
    "- TON : Soyez empathique mais direct. Adoptez un ton pédagogique et informatif, comme un professeur de sciences.\n",
    "- STRUCTURE : Organisez le résumé en paragraphes logiques. Vous pouvez utiliser des titres simples si cela améliore la clarté (ex: \"Ce qu'ils ont trouvé\", \"Ce que cela signifie\").\n",
    "- PRIORITÉ : Résumez les principales observations et leurs implications. Omettez les détails mineurs ou très techniques.\n",
    "\n",
    "- N'utilisez jamais d'emojis.\n",
    "- N'expliquez pas la prononciation.\n",
    "\"\"\",\n",
    "    \"B3\": \"\"\"Vous êtes un assistant de résumé entraîné à réécrire des résumés médicaux pour un adulte éduqué non-médecin (âges 17+). Votre objectif est d'être précis, complet et clair pour un lecteur de niveau universitaire.\n",
    "\n",
    "Mandat Principal :\n",
    "- PUBLIC CIBLE : Un étudiant ou un adulte curieux sans formation médicale.\n",
    "- OBJECTIF PRINCIPAL : Précision et clarté structurée.\n",
    "\n",
    "Règles Strictes à Suivre Impérativement :\n",
    "- LANGUE : Le résumé DOIT être rédigé en français.\n",
    "- LONGUEUR DES PHRASES : Utilisez des phrases claires et bien construites. Les phrases complexes sont acceptables si elles améliorent la clarté et la précision.\n",
    "- VOCABULAIRE : Utilisez la terminologie médicale correcte. Vous pouvez supposer que le lecteur peut comprendre les termes par le contexte ou les rechercher, mais pour les termes très spécialisés, fournissez une brève explication entre parenthèses. Par exemple : \"...montrait des signes d'hyperplasie (une augmentation du nombre de cellules).\"\n",
    "- TON : Maintenez un ton professionnel, empathique et respectueux. Soyez directif mais ni clinique ni froid.\n",
    "- STRUCTURE : Fournissez un résumé détaillé et structuré. Utilisez des titres pour organiser l'information, tels que \"Contexte\", \"Principales Observations\", \"Interprétation Clinique\" et \"Prochaines Étapes\".\n",
    "- PRIORITÉ : Soyez complet et fidèle au résumé source. Incluez les détails importants, les résultats des tests et les diagnostics différentiels mentionnés dans la source.\n",
    "\n",
    "- N'utilisez jamais d'emojis.\n",
    "- N'expliquez pas la prononciation.\n",
    "\"\"\"\n",
    "},\n",
    "\n",
    "\"pt\": {\n",
    "    \"B1\": \"\"\"Você é um assistente de resumo. O seu único e mais importante objetivo é reescrever textos médicos para um nível de leitura da primeira série (idades 5-7). A simplicidade é mais importante que os detalhes.\n",
    "\n",
    "Mandato Principal:\n",
    "- PÚBLICO-ALVO: Uma criança de 6 anos.\n",
    "- OBJETIVO PRINCIPAL: Simplicidade extrema. Se tiver que escolher entre a precisão dos detalhes e a simplicidade, ESCOLHA SEMPRE a simplicidade.\n",
    "\n",
    "Regras Rígidas que Você Deve Seguir:\n",
    "- IDIOMA: O resumo DEVE ser escrito em português.\n",
    "- COMPRIMENTO DAS FRASES: Quase todas as frases devem ter menos de 10 palavras. Use frases muito curtas e simples.\n",
    "- VOCABULÁRIO: Use apenas palavras quotidianas e muito comuns que uma criança da primeira série conheceria. Evite qualquer termo médico ou científico. Em vez de 'fêmur', diga 'o osso da coxa'. Em vez de 'benigno', diga 'que não faz mal'.\n",
    "- TOM: Seja muito gentil, calmo e tranquilizador. Como um médico amável a explicar algo a uma criança pequena.\n",
    "- ESTRUTURA: Use parágrafos curtos, muitas vezes com apenas uma ou duas frases.\n",
    "- FOCO: Mencione apenas um ou dois dos pontos mais importantes do texto original. Omita todos os outros detalhes.\n",
    "\n",
    "- Nunca use emojis.\n",
    "- Não explique a pronúncia.\n",
    "- NÃO use NENHUM jargão médico.\n",
    "\"\"\",\n",
    "    \"B2\": \"\"\"Você é um assistente de resumo treinado para reescrever resumos médicos para um nível de leitura do ensino fundamental II (idades 11–14). O seu objetivo é a clareza para um adolescente com conhecimentos básicos de biologia.\n",
    "\n",
    "Mandato Principal:\n",
    "- PÚBLICO-ALVO: Um adolescente de 14 anos numa aula de biologia.\n",
    "- OBJETIVO PRINCIPAL: Clareza e explicação direta.\n",
    "\n",
    "Regras Rígidas que Você Deve Seguir:\n",
    "- IDIOMA: O resumo DEVE ser escrito em português.\n",
    "- COMPRIMENTO DAS FRASES: Varie o comprimento das frases, mas procure uma média de 12 a 18 palavras. Evite frases longas e complexas.\n",
    "- VOCABULÁRIO: Pode usar termos médicos básicos (ex: 'biópsia', 'células', 'tumor'), mas você DEVE explicá-los em termos simples imediatamente. Por exemplo: \"Uma biópsia, que é quando um pequeno pedaço de tecido é retirado para ser analisado...\".\n",
    "- TOM: Seja empático, mas direto. Use um tom educativo e informativo, como um professor de ciências.\n",
    "- ESTRUTURA: Organize o resumo em parágrafos lógicos. Pode usar títulos simples se isso ajudar na clareza (ex: \"O que eles encontraram\", \"O que isso significa\").\n",
    "- FOCO: Resuma os principais achados e as suas implicações. Omita detalhes menores ou muito técnicos.\n",
    "\n",
    "- Nunca use emojis.\n",
    "- Não explique a pronúncia.\n",
    "\"\"\",\n",
    "    \"B3\": \"\"\"Você é um assistente de resumo treinado para reescrever resumos médicos para um adulto instruído, mas sem formação médica (idades 17+). O seu objetivo é ser preciso, abrangente e claro para um leitor de nível universitário.\n",
    "\n",
    "Mandato Principal:\n",
    "- PÚBLICO-ALVO: Um estudante universitário ou adulto curioso sem formação médica.\n",
    "- OBJETIVO PRINCIPAL: Precisão e clareza estruturada.\n",
    "\n",
    "Regras Rígidas que Você Deve Seguir:\n",
    "- IDIOMA: O resumo DEVE ser escrito em português.\n",
    "- COMPRIMENTO DAS FRASES: Use frases claras e bem construídas. Frases complexas são aceitáveis se melhorarem a clareza e a precisão.\n",
    "- VOCABULÁRIO: Use a terminologia médica correta. Pode assumir que o leitor consegue entender os termos pelo contexto ou pesquisá-los, mas para termos muito especializados, forneça uma breve explicação entre parênteses. Por exemplo: \"...mostrou evidência de hiperplasia (um aumento no número de células).\"\n",
    "- TOM: Mantenha um tom profissional, empático e respeitoso. Seja confiante, mas não clínico ou frio.\n",
    "- ESTRUTURA: Forneça um resumo detalhado e estruturado. Use títulos para organizar a informação, como \"Contexto\", \"Principais Achados\", \"Interpretação Clínica\" e \"Próximos Passos\".\n",
    "- FOCO: Seja abrangente e fiel ao resumo original. Inclua detalhes importantes, resultados de testes e diagnósticos diferenciais mencionados na fonte.\n",
    "\n",
    "- Nunca use emojis.\n",
    "- Não explique a pronúncia.\n",
    "\"\"\"\n",
    "}\n",
    "\n",
    "}\n",
    "USER_PROMPT_TEMPLATES = {\n",
    "    \"en\": \"\"\"Please rewrite the following expert summary for the specified target audience. Use the full article for context if needed.\n",
    "**Full Article Context:**\n",
    "{article}\n",
    "**Expert Summary to Rewrite:**\n",
    "{gold_summary}\n",
    "\"\"\",\n",
    "    \"es\": \"\"\"Por favor, reescribe el siguiente resumen de experto para el público objetivo especificado. Usa el artículo completo como contexto si es necesario.\n",
    "**Contexto del Artículo Completo:**\n",
    "{article}\n",
    "**Resumen de Experto a Reescribir:**\n",
    "{gold_summary}\n",
    "\"\"\",\n",
    "    \"fr\": \"\"\"Veuillez réécrire le résumé d'expert suivant pour le public cible spécifié. Utilisez l'article complet comme contexte si nécessaire.\n",
    "**Contexte de l'Article Complet :**\n",
    "{article}\n",
    "**Résumé d'Expert à Réécrire :**\n",
    "{gold_summary}\n",
    "\"\"\",\n",
    "    \"pt\": \"\"\"Por favor, reescreva o seguinte resumo de especialista para o público-alvo especificado. Use o artigo completo como contexto, se necessário.\n",
    "**Contexto do Artigo Completo:**\n",
    "{article}\n",
    "**Resumo do Especialista a Ser Reescrito:**\n",
    "{gold_summary}\n",
    "\"\"\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bb9ee67",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e40397cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "lang=\"es\"\n",
    "with open('/home/mshahidul/readctrl/generating_data/tik_ache/es_syntheticV3.json', 'r', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "converted = []\n",
    "prompts_for_lang = ALL_PROMPTS.get(lang)\n",
    "user_prompt_template = USER_PROMPT_TEMPLATES.get(lang)\n",
    "for msg in data:\n",
    "    conversation={}\n",
    "    for key in msg['synthetic_summary'].keys():\n",
    "        system_prompt = prompts_for_lang[key]\n",
    "        sys_msg=msg['synthetic_summary'][key]\n",
    "        user_prompt = user_prompt_template.format(article=msg['article'], gold_summary=msg['gold_summary'])\n",
    "        conversation['conversations']= (\n",
    "            {'from': \"human\", 'content': system_prompt+'\\n'+user_prompt},\n",
    "            {'from': \"gpt\", 'content': sys_msg},\n",
    "        )\n",
    "        converted.append(conversation)\n",
    "\n",
    "# Save or print the result\n",
    "with open(f'/home/mshahidul/readctrl/data_train/{lang}_train.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(converted, f, ensure_ascii=False, indent=2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4373e6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/home/mshahidul/readctrl/data_train/es_train.json', 'r', encoding='utf-8') as f:\n",
    "    es_data = json.load(f)\n",
    "print(es_data[0]['conversations'][1]['content'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e8e1d2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_prompt(article, gold_summary, band, lang):\n",
    "    \"\"\"Build the full prompt text (system prompt, a newline, then the filled-in user prompt) for a given readability band and language.\"\"\"\n",
    "    prompts_for_lang = ALL_PROMPTS.get(lang)\n",
    "    user_prompt_template = USER_PROMPT_TEMPLATES.get(lang)\n",
    "    if not prompts_for_lang or not user_prompt_template:\n",
    "        raise ValueError(f\"No prompts available for language: {lang}\")\n",
    "    \n",
    "    system_prompt = prompts_for_lang[band]\n",
    "    user_prompt = user_prompt_template.format(article=article, gold_summary=gold_summary)\n",
    "    return system_prompt + \"\\n\" + user_prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb14cb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "lang=\"es\"\n",
    "with open('/home/mshahidul/readctrl/generating_data/tik_ache/es_syntheticV3.json', 'r', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "converted = []\n",
    "prompts_for_lang = ALL_PROMPTS.get(lang)\n",
    "user_prompt_template = USER_PROMPT_TEMPLATES.get(lang)\n",
    "for msg in data:\n",
    "    for key in msg['synthetic_summary'].keys():\n",
    "        conversation={}\n",
    "        system_prompt = prompts_for_lang[key]\n",
    "        sys_msg=msg['synthetic_summary'][key]\n",
    "        user_prompt = user_prompt_template.format(article=msg['article'], gold_summary=msg['gold_summary'])\n",
    "        conversation['conversations']= (\n",
    "            {'from': \"human\", 'content': system_prompt+'\\n'+user_prompt},\n",
    "            {'from': \"gpt\", 'content': sys_msg},\n",
    "        )\n",
    "        converted.append(conversation)\n",
    "\n",
    "# Save or print the result\n",
    "with open(f'/home/mshahidul/readctrl/data_train/{lang}_train.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(converted, f, ensure_ascii=False, indent=2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b82bd543",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "with open('/home/mshahidul/readctrl/synthetic_data_es_raw/0.json', 'r', encoding='utf-8') as f:\n",
    "    raw_es_data = json.load(f)\n",
    "print(f\"easy:- {raw_es_data['readability_versions']['easy']['text']}\")\n",
    "print(f\"intermediate:- {raw_es_data['readability_versions']['intermediate']['text']}\")\n",
    "print(f\"hard:- {raw_es_data['readability_versions']['hard']['text']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aca0ef62",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "raw_dir = '/home/mshahidul/readctrl/synthetic_data_es_raw'\n",
    "raw_files = [f for f in os.listdir(raw_dir) if f.endswith('.json')]\n",
    "\n",
    "raw_data_list = []\n",
    "for fname in raw_files:\n",
    "    with open(os.path.join(raw_dir, fname), 'r', encoding='utf-8') as f:\n",
    "        raw_data_list.append(json.load(f))\n",
    "\n",
    "print(f\"Loaded {len(raw_data_list)} files from {raw_dir}\")\n",
    "with open('/home/mshahidul/readctrl/data/hand_create_gpt5/es_rawV1.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(raw_data_list, f, ensure_ascii=False, indent=4)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c6d8fb6",
   "metadata": {},
   "source": [
    "## dataset modified for training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0899cccb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "prompts={\n",
    "\"easy\":'''\n",
    "You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.\n",
    "Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70-100; grade 5-7).\n",
    "Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.\n",
    "Keep all important factual details, but remove jargon.\n",
    "Return only the rewritten text without commentary.\n",
    "''',\n",
    "\n",
    "'intermediate':'''\n",
    "You are an assistant specialized in rewriting Spanish texts with medium readability.\n",
    "Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50-70; grade 8-12).\n",
    "Use clear and complete sentences, moderately complex vocabulary, and structured narration.\n",
    "Retain all relevant medical or factual information, but phrase it in accessible language.\n",
    "Return only the rewritten text with no explanations.\n",
    "''',\n",
    "\n",
    "'hard':'''\n",
    "You are an assistant that rewrites Spanish medical texts with professional, technical precision.\n",
    "Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.\n",
    "The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).\n",
    "Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.\n",
    "Return only the rewritten text.\n",
    "'''\n",
    "}\n",
    "with open('/home/mshahidul/readctrl/data/hand_create_gpt5/es_rawV1.json', 'r', encoding='utf-8') as f:\n",
    "    gpt5_syn_es = json.load(f)\n",
    "gpt5_syn_es[0]\n",
    "import json\n",
    "\n",
    "with open('/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_es.json', 'r', encoding='utf-8') as f:\n",
    "    test_data = json.load(f)\n",
    "\n",
    "def full_text(id):\n",
    "    for item in test_data:\n",
    "        if item['id'] == id:\n",
    "            return item['fulltext']\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38186215",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9ce8569",
   "metadata": {},
   "outputs": [],
   "source": [
    "converted = []\n",
    "cnt=0\n",
    "for item in gpt5_syn_es:\n",
    "    readability_data=item['readability_versions']\n",
    "    fulltext=full_text(item['id'])\n",
    "    for band, band_data in readability_data.items():\n",
    "        conversation={}\n",
    "        system_prompt=prompts[band]\n",
    "        conversation['conversations']= (\n",
    "                {'from': \"human\", 'content': system_prompt+'\\n\\n'+\"Input text:\\n\"+fulltext},\n",
    "                {'from': \"gpt\", 'content': band_data['text']},\n",
    "            )\n",
    "        converted.append(conversation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52be9a01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# [{'content': 'reasoning language: French\\n\\nYou are an AI chatbot with a lively and energetic personality.',\n",
    "#   'role': 'system',\n",
    "#   'thinking': None},\n",
    "#  {'content': 'Can you show me the latest trends on Twitter right now?',\n",
    "#   'role': 'user',\n",
    "#   'thinking': None},\n",
    "#  {'content': 'Hey there!  While I can\\'t check Twitter (X) in real-time or access live data, I can share some tips to help you spot the latest trends:\\n\\n1. **Open the \"Trending\" tab** on the Twitter app or website – it updates constantly!  \\n2. **Search for hashtags** like #Trending or #Viral to see what’s blowing up.  \\n3. **Follow accounts** that curate trends (e.g., @TrendingNow, @ViralThreads).  \\n4. **Check regional trends** – they often differ by location!  \\n\\nRemember, trends are *super fast-moving* and often tied to pop culture, memes, or breaking news. For example, recent trends have included viral challenges (like the \"Distracted Boyfriend\" meme revival), celebrity drama, or unexpected events (hello, weather disasters!).  \\n\\nWant me to brainstorm *what* might trend next? I’ve got ideas!',\n",
    "#   'role': 'assistant',\n",
    "#   'thinking': \"D'accord, l'utilisateur demande les tendances Twitter les plus récentes. Tout d'abord, je dois vérifier si j'ai accès à des données en temps réel. Étant donné que je ne peux pas naviguer sur Internet ou accéder directement à l'API de Twitter, je ne peux pas fournir des tendances en direct. Cependant, je peux donner quelques conseils généraux sur la façon de les trouver.\\n\\nJe devrais préciser que les tendances Twitter évoluent rapidement et sont spécifiques à chaque région. Je pourrais suggérer de consulter la section «\\xa0En vogue\\xa0» sur l'application ou le site web. Aussi, l'utilisation de hashtags et le suivi d'utilisateurs pertinents pourraient être utiles. Il est important de souligner que les tendances varient selon la région et l'heure de la journée. Je devrais garder un ton amical et bienveillant, peut-être ajouter un emoji pour rester léger. Je vais structurer ma réponse étape par étape pour faciliter la lecture. Je dois m'excuser de ne pas pouvoir fournir des données en temps réel et proposer d'autres méthodes. Je conserverai un langage simple et convivial, en évitant les termes techniques.\"}]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a71fdf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "converted = []\n",
    "cnt=0\n",
    "for item in gpt5_syn_es:\n",
    "    readability_data=item['readability_versions']\n",
    "    fulltext=full_text(item['id'])\n",
    "    for band, band_data in readability_data.items():\n",
    "        conversation={}\n",
    "        system_prompt=prompts[band]\n",
    "        conversation['messages']= (\n",
    "                {'role': \"system\", 'content': system_prompt, 'thinking': None},\n",
    "                {'role': \"user\", 'content': \"Input text:\\n\"+fulltext, 'thinking': None},\n",
    "                {'role': \"assistant\", 'content': band_data['text'], 'thinking': None},\n",
    "            )\n",
    "        converted.append(conversation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f173809",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c20a9f4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(f'/home/mshahidul/readctrl/data/hand_create_gpt5/es_trainV1.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(converted, f, ensure_ascii=False, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "975d8e1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyphen\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Initialize Spanish syllable dictionary\n",
    "dic = pyphen.Pyphen(lang='es')\n",
    "\n",
    "# --- FH Score Functions ---\n",
    "def count_syllables(word):\n",
    "    hyphenated = dic.inserted(word)\n",
    "    return len(hyphenated.split('-'))\n",
    "\n",
    "def huerta_score(text):\n",
    "    \"\"\"\n",
    "    Compute the Fernández Huerta readability score for Spanish text.\n",
    "    FH = 206.84 - 60 * (Syllables per Word) - 1.02 * (Words per Sentence)\n",
    "    \"\"\"\n",
    "    sentences = [s for s in text.split('.') if s.strip()]\n",
    "    words = [w for w in text.split() if w.isalpha()]\n",
    "    if not words or not sentences:\n",
    "        return 0.0\n",
    "    total_syllables = sum(count_syllables(word.lower()) for word in words)\n",
    "    avg_syllables_per_word = total_syllables / len(words)\n",
    "    avg_sentence_length = len(words) / len(sentences)\n",
    "    score = 206.84 - 60 * avg_syllables_per_word - 1.02 * avg_sentence_length\n",
    "    return round(score, 2)\n",
    "\n",
    "# --- Plotting Function ---\n",
    "def plot_fh_scores(text_list):\n",
    "    scores = [huerta_score(t) for t in text_list]\n",
    "    indices = list(range(len(text_list)))\n",
    "\n",
    "    plt.figure(figsize=(10, 5))\n",
    "    plt.plot(indices, scores, 'ko', label='FH Score')\n",
    "    plt.axhspan(70, 100, color='green', alpha=0.1, label='Easy (70-100)')\n",
    "    plt.axhspan(50, 70, color='blue', alpha=0.1, label='Intermediate (50-70)')\n",
    "    plt.axhspan(0, 50, color='red', alpha=0.1, label='Hard (0-50)')\n",
    "    plt.xlabel('Text Index')\n",
    "    plt.ylabel('Fernández Huerta Score')\n",
    "    plt.title('Fernández Huerta Readability Scores')\n",
    "    plt.legend()\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Also print results\n",
    "    for i, s in enumerate(scores):\n",
    "        print(f\"Text {i}: FH Score = {s}\")\n",
    "\n",
    "        # Example: Compute FH score for the \"hard\" band_data text\n",
    "        hard_text = band_data['text']\n",
    "        hard_score = huerta_score(hard_text)\n",
    "        print(f'Fernández Huerta score for \"hard\" band: {hard_score}')\n",
    "# --- Example Usage ---\n",
    "# texts = [\n",
    "#     \"Este es un texto muy simple y fácil de leer. Las oraciones son cortas.\",\n",
    "#     \"El presente documento aborda temas complejos relacionados con la neurociencia cognitiva y su aplicación en sistemas computacionales.\",\n",
    "#     \"El perro corre rápido. Juega con la pelota. Se divierte mucho.\"\n",
    "# ]\n",
    "\n",
    "# plot_fh_scores(texts)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "804a3d10",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "test_en_path = '/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_en.json'\n",
    "with open(test_en_path, 'r', encoding='utf-8') as f:\n",
    "    test_en_data = json.load(f)\n",
    "\n",
    "print(f\"Loaded {len(test_en_data)} items from {test_en_path}\")\n",
    "print(test_en_data[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a230d18",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "e372abbf",
   "metadata": {},
   "source": [
    "## Model accuracy check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1190eb4b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--------------------------------------------------\n",
      "temp0.1_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 49, 'intermediate': 14, 'hard': 9}\n",
      "easy: 98.00%, intermediate: 28.00%, hard: 18.00%\n",
      "--------------------------------------------------\n",
      "temp0.3_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 48, 'intermediate': 15, 'hard': 10}\n",
      "easy: 96.00%, intermediate: 30.00%, hard: 20.00%\n",
      "--------------------------------------------------\n",
      "temp0.5_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 37, 'intermediate': 32, 'hard': 17}\n",
      "easy: 74.00%, intermediate: 64.00%, hard: 34.00%\n",
      "--------------------------------------------------\n",
      "temp1.3_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 46, 'intermediate': 25, 'hard': 24}\n",
      "easy: 92.00%, intermediate: 50.00%, hard: 48.00%\n",
      "--------------------------------------------------\n",
      "temp1.1_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 40, 'intermediate': 30, 'hard': 29}\n",
      "easy: 80.00%, intermediate: 60.00%, hard: 58.00%\n",
      "--------------------------------------------------\n",
      "temp1.0_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 43, 'intermediate': 32, 'hard': 18}\n",
      "easy: 86.00%, intermediate: 64.00%, hard: 36.00%\n",
      "--------------------------------------------------\n",
      "temp1.5_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 24, 'intermediate': 26, 'hard': 33}\n",
      "easy: 48.00%, intermediate: 52.00%, hard: 66.00%\n",
      "--------------------------------------------------\n",
      "temp1.3_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 29, 'intermediate': 38, 'hard': 29}\n",
      "easy: 58.00%, intermediate: 76.00%, hard: 58.00%\n",
      "--------------------------------------------------\n",
      "temp0.7_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 48, 'intermediate': 16, 'hard': 10}\n",
      "easy: 96.00%, intermediate: 32.00%, hard: 20.00%\n",
      "--------------------------------------------------\n",
      "temp0.5_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 48, 'intermediate': 20, 'hard': 9}\n",
      "easy: 96.00%, intermediate: 40.00%, hard: 18.00%\n",
      "--------------------------------------------------\n",
      "temp0.7_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 43, 'intermediate': 23, 'hard': 11}\n",
      "easy: 86.00%, intermediate: 46.00%, hard: 22.00%\n",
      "--------------------------------------------------\n",
      "temp1.4_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 48, 'intermediate': 27, 'hard': 26}\n",
      "easy: 96.00%, intermediate: 54.00%, hard: 52.00%\n",
      "--------------------------------------------------\n",
      "temp1.1_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 48, 'intermediate': 16, 'hard': 13}\n",
      "easy: 96.00%, intermediate: 32.00%, hard: 26.00%\n",
      "--------------------------------------------------\n",
      "temp1.4_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 28, 'intermediate': 27, 'hard': 30}\n",
      "easy: 56.00%, intermediate: 54.00%, hard: 60.00%\n",
      "--------------------------------------------------\n",
      "temp0.1_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 40, 'intermediate': 32, 'hard': 16}\n",
      "easy: 80.00%, intermediate: 64.00%, hard: 32.00%\n",
      "--------------------------------------------------\n",
      "temp1.2_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 48, 'intermediate': 20, 'hard': 28}\n",
      "easy: 96.00%, intermediate: 40.00%, hard: 56.00%\n",
      "--------------------------------------------------\n",
      "temp0.3_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 40, 'intermediate': 32, 'hard': 9}\n",
      "easy: 80.00%, intermediate: 64.00%, hard: 18.00%\n",
      "--------------------------------------------------\n",
      "temp1.5_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 47, 'intermediate': 20, 'hard': 33}\n",
      "easy: 94.00%, intermediate: 40.00%, hard: 66.00%\n",
      "--------------------------------------------------\n",
      "temp1.0_qwen3-14B_base_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 47, 'intermediate': 18, 'hard': 16}\n",
      "easy: 94.00%, intermediate: 36.00%, hard: 32.00%\n",
      "--------------------------------------------------\n",
      "temp1.2_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
      "{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
      "{'easy': 39, 'intermediate': 36, 'hard': 27}\n",
      "easy: 78.00%, intermediate: 72.00%, hard: 54.00%\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import pyphen\n",
    "import matplotlib.pyplot as plt\n",
    "band_ranges = {\n",
    "    \"easy\": (70, 100),        # Easy\n",
    "    \"intermediate\": (50, 70), # Intermediate\n",
    "    \"hard\": (0, 50)           # Hard\n",
    "}\n",
    "# Initialize Spanish syllable dictionary\n",
    "dic = pyphen.Pyphen(lang='es')\n",
    "\n",
    "# --- FH Score Functions ---\n",
    "def count_syllables(word):\n",
    "    hyphenated = dic.inserted(word)\n",
    "    return len(hyphenated.split('-'))\n",
    "\n",
    "def huerta_score(text):\n",
    "    \"\"\"\n",
    "    Compute the Fernández Huerta readability score for Spanish text.\n",
    "    FH = 206.84 - 60 * (Syllables per Word) - 1.02 * (Words per Sentence)\n",
    "    \"\"\"\n",
    "    sentences = [s for s in text.split('.') if s.strip()]\n",
    "    words = [w for w in text.split() if w.isalpha()]\n",
    "    if not words or not sentences:\n",
    "        return 0.0\n",
    "    total_syllables = sum(count_syllables(word.lower()) for word in words)\n",
    "    avg_syllables_per_word = total_syllables / len(words)\n",
    "    avg_sentence_length = len(words) / len(sentences)\n",
    "    score = 206.84 - 60 * avg_syllables_per_word - 1.02 * avg_sentence_length\n",
    "    return round(score, 2)\n",
    "def accuracy_check(path):\n",
    "    import json\n",
    "    texts=[]\n",
    "    accuracy_data = {'easy': 0, 'intermediate': 0, 'hard': 0}\n",
    "    num_each_band = {'easy': 0, 'intermediate': 0, 'hard': 0}\n",
    "    with open(path, 'r', encoding='utf-8') as f:\n",
    "        results_es = json.load(f)\n",
    "\n",
    "    for item in results_es:\n",
    "        dat=(item['synthetic_summary'].split(\"</think>\")[1].strip())\n",
    "        # print(item['band'])\n",
    "        band_data = item['band']\n",
    "        huerta_score_val = huerta_score(dat)\n",
    "        band_min, band_max = band_ranges[band_data]\n",
    "        if huerta_score_val >= band_min and huerta_score_val <= band_max:\n",
    "            accuracy_data[band_data] += 1\n",
    "        num_each_band[band_data] += 1\n",
    "    print(\"-\"*50)\n",
    "    print(f\"{os.path.basename(path)} accuracy results:\")\n",
    "    print(num_each_band)\n",
    "    print(accuracy_data)\n",
    "    print(f\"easy: {(accuracy_data['easy']/num_each_band['easy'])*100:.2f}%, intermediate: {(accuracy_data['intermediate']/num_each_band['intermediate'])*100:.2f}%, hard: {(accuracy_data['hard']/num_each_band['hard'])*100:.2f}%\")\n",
    "for ind in os.listdir(\"/home/mshahidul/readctrl/results/custom_promptsV1\"):\n",
    "    if ind.endswith('.json'):\n",
    "        accuracy_check(os.path.join(\"/home/mshahidul/readctrl/results/custom_promptsV1\", ind))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6534a993",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('/home/mshahidul/readctrl/data/hand_create_gpt5/es_trainV1.json', 'r', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "print(len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed98df6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pyphen\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import defaultdict\n",
    "\n",
    "# === CONFIG ===\n",
    "root = \"/home/mshahidul/readctrl/data/hand_create_gpt5\"\n",
    "input_json = f\"{root}/es_rawV1.json\"\n",
    "output_json = f\"{root}/filtered_es_rawV1.json\"\n",
    "\n",
    "band_ranges = {\n",
    "    \"easy\": (70, 100),\n",
    "    \"intermediate\": (50, 70),\n",
    "    \"hard\": (0, 50)\n",
    "}\n",
    "\n",
    "# margin zone to remove texts near band boundaries\n",
    "margin = 5  # e.g., easy keeps scores >= 75; intermediate 55–65; hard <= 45\n",
    "\n",
    "# === FH Score Calculation ===\n",
    "dic = pyphen.Pyphen(lang='es')\n",
    "\n",
    "def count_syllables(word):\n",
    "    hyphenated = dic.inserted(word)\n",
    "    return len(hyphenated.split('-'))\n",
    "\n",
    "def huerta_score(text):\n",
    "    sentences = [s for s in text.split('.') if s.strip()]\n",
    "    words = [w for w in text.split() if w.isalpha()]\n",
    "    if not words or not sentences:\n",
    "        return 0.0\n",
    "    total_syllables = sum(count_syllables(word.lower()) for word in words)\n",
    "    avg_syllables_per_word = total_syllables / len(words)\n",
    "    avg_sentence_length = len(words) / len(sentences)\n",
    "    score = 206.84 - 60 * avg_syllables_per_word - 1.02 * avg_sentence_length\n",
    "    return round(score, 2)\n",
    "\n",
    "# === Band validation ===\n",
    "def is_in_band(score, band_name):\n",
    "    low, high = band_ranges[band_name]\n",
    "    # reject scores too close to boundaries\n",
    "    if band_name == \"easy\" and score < low + margin:\n",
    "        return False\n",
    "    if band_name == \"intermediate\" and (score < low + margin or score > high - margin):\n",
    "        return False\n",
    "    if band_name == \"hard\" and score > high - margin:\n",
    "        return False\n",
    "    return low <= score <= high\n",
    "\n",
    "# === Process Dataset ===\n",
    "with open(input_json, \"r\", encoding=\"utf-8\") as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "filtered_data = []\n",
    "scores_summary = defaultdict(list)\n",
    "removed_count = defaultdict(int)\n",
    "\n",
    "for item in data:\n",
    "    keep_item = True\n",
    "    invalid_bands = set()\n",
    "\n",
    "    for level in [\"easy\", \"intermediate\", \"hard\"]:\n",
    "        text = item[\"readability_versions\"][level][\"text\"]\n",
    "        score = huerta_score(text)\n",
    "        item[\"readability_versions\"][level][\"FH_score\"] = score\n",
    "        scores_summary[level].append(score)\n",
    "\n",
    "        if not is_in_band(score, level):\n",
    "            invalid_bands.add(level)\n",
    "            removed_count[level] += 1\n",
    "            keep_item = False  # remove if any version invalid\n",
    "\n",
    "    if keep_item:\n",
    "        filtered_data.append(item)\n",
    "\n",
    "# === Save filtered dataset ===\n",
    "with open(output_json, \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(filtered_data, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "# === Print stats ===\n",
    "print(f\"✅ Original dataset size: {len(data)}\")\n",
    "print(f\"✅ Filtered dataset size: {len(filtered_data)}\")\n",
    "print(f\"🗑️  Removed total: {len(data) - len(filtered_data)}\")\n",
    "print(\"\\n📊 Removal per readability band:\")\n",
    "for level in [\"easy\", \"intermediate\", \"hard\"]:\n",
    "    print(f\"  {level.capitalize():<15}: {removed_count[level]} removed\")\n",
    "\n",
    "# === Plot distribution ===\n",
    "plt.figure(figsize=(10, 6))\n",
    "for level, color in zip([\"easy\", \"intermediate\", \"hard\"], ['green', 'blue', 'red']):\n",
    "    plt.scatter([level]*len(scores_summary[level]), scores_summary[level],\n",
    "                color=color, label=level, alpha=0.6)\n",
    "plt.axhspan(70, 100, color='green', alpha=0.1, label='Easy Band')\n",
    "plt.axhspan(50, 70, color='blue', alpha=0.1, label='Intermediate Band')\n",
    "plt.axhspan(0, 50, color='red', alpha=0.1, label='Hard Band')\n",
    "plt.ylabel(\"Fernández Huerta Score\")\n",
    "plt.title(\"Fernández Huerta Scores per Readability Level\")\n",
    "plt.legend()\n",
    "plt.grid(alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03b3905c",
   "metadata": {},
   "source": [
    "## Command generator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0f8250c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def distribute_commands(all_ref,free_gpu):\n",
    "    new_li = []\n",
    "    num_gpus = len(free_gpu)\n",
    "    total = len(all_ref)\n",
    "    base_allocate = total // num_gpus\n",
    "    # give each GPU a contiguous slice of all_ref and swap the '--cuda -1' placeholder for its id\n",
    "    for g in range(num_gpus - 1):\n",
    "        temp = all_ref[g * base_allocate : (g + 1) * base_allocate]\n",
    "        temp = [d.replace(\"--cuda -1\", f\"--cuda {free_gpu[g]}\") for d in temp]\n",
    "        new_li.append(temp)\n",
    "    temp = all_ref[(num_gpus - 1) * base_allocate :]\n",
    "    temp = [d.replace(\"--cuda -1\", f\"--cuda {free_gpu[num_gpus - 1]}\") for d in temp]\n",
    "    new_li.append(temp)\n",
    "    return new_li"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6748b6ec",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# parser.add_argument(\"--cuda\", type=str, default=\"3\", help=\"CUDA device id, e.g., '0' or '0,1' for multiple GPUs\")\n",
    "# parser.add_argument(\"--model_name\", type=str, default=\"/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2\", help=\"Path to the finetuned model\")\n",
    "# parser.add_argument(\"--temperature\", type=float, default=0.1, help=\"Generation temperature\")\n",
    "all_cmds = []\n",
    "# '/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2'\n",
    "model_names = [ '/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2','unsloth/Qwen3-14B']\n",
    "for model_name in model_names:\n",
    "    # temp_list=[0.1, 0.3, 0.5, 0.7, 1.0, 1.1]\n",
    "    temp_list=[1.2,1.3,1.4,1.5]\n",
    "    for temp in temp_list:\n",
    "        cmd = f\"python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name {model_name} --temperature {temp} --cuda -1\"\n",
    "        # cmd = f\"python /home/mshahidul/readctrl/code/finetune-inference/inferenceV3.py --model_name {model_name} --temperature {temp} --cuda -1\"\n",
    "        # cmd = f\"CUDA_VISIBLE_DEVICES=-1 python /home/mshahidul/readctrl/code/finetune-inference/inferenceV3_temp.py --model_name {model_name} --temperature {temp}\"\n",
    "        all_cmds.append(cmd)\n",
    "len(all_cmds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "673595ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name /home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2 --temperature 1.2 --cuda 2\n",
      "python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name /home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2 --temperature 1.3 --cuda 2\n",
      "python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name /home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2 --temperature 1.4 --cuda 2\n",
      "python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name /home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2 --temperature 1.5 --cuda 2\n"
     ]
    }
   ],
   "source": [
    "# gamma 2: 2, beta 3: 3\n",
    "free_gpu=[2,3]\n",
    "distributed_cmds = distribute_commands(all_cmds, free_gpu)\n",
    "for sets in distributed_cmds[0]:\n",
    "    print(sets)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f184d424",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "unsloth",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}