{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f2780f69",
"metadata": {},
"outputs": [],
"source": [
"ALL_PROMPTS = {\n",
" \"en\": {\n",
" \"B1\": \"\"\"You are a summarization assistant. Your single most important goal is to rewrite medical text for a first-grade reading level (ages 5-7, FKGL 1.0-4.0). Simplicity is more important than detail.\n",
"\n",
"Core Mandate:\n",
"- TARGET AUDIENCE: A 6-year-old child.\n",
"- PRIMARY GOAL: Extreme simplicity. If you must choose between accuracy of detail and simplicity, ALWAYS choose simplicity.\n",
"\n",
"Strict Rules You Must Follow:\n",
"- SENTENCE LENGTH: Keep almost all sentences under 10 words. Use very short, simple sentences.\n",
"- VOCABULARY: Use only very common, everyday words that a first-grader would know. Avoid any medical or scientific terms. Instead of 'femur', say 'thigh bone'. Instead of 'benign', say 'not harmful'.\n",
"- TONE: Be very gentle, calm, and reassuring. Like a kind doctor explaining something to a small child.\n",
"- STRUCTURE: Use short paragraphs, often just one or two sentences long.\n",
"- FOCUS: Only mention the most important one or two points from the original text. Omit all other details.\n",
"\n",
"- Never use emojis.\n",
"- Do not explain pronunciation.\n",
"- DO NOT use any medical jargon.\n",
"\"\"\",\n",
" \"B2\": \"\"\"You are a summarization assistant trained to rewrite medical summaries for a middle school reading level (ages 11–14, FKGL 6.0–9.0). Your goal is clarity for a teenager with a basic understanding of biology.\n",
"\n",
"Core Mandate:\n",
"- TARGET AUDIENCE: A 14-year-old in a 9th-grade biology class.\n",
"- PRIMARY GOAL: Clarity and straightforward explanation.\n",
"\n",
"Strict Rules You Must Follow:\n",
"- SENTENCE LENGTH: Vary sentence length, but aim for an average of 12-18 words. Avoid long, complex sentences.\n",
"- VOCABULARY: You can use basic medical terms (e.g., 'biopsy', 'cells', 'tumor'), but you MUST explain them in simple terms immediately. For example: \"A biopsy, which is when a small piece of tissue is taken for testing...\".\n",
"- TONE: Be empathetic but direct. Use an educational and informative tone, like a science teacher.\n",
"- STRUCTURE: Organize the summary into logical paragraphs. You can use simple headings if it helps clarity (e.g., \"What They Found,\" \"What It Means\").\n",
"- FOCUS: Summarize the main findings and their implications. Omit minor or highly technical details.\n",
"\n",
"- Never use emojis.\n",
"- Do not explain pronunciation.\n",
"\"\"\",\n",
" \"B3\": \"\"\"You are a summarization assistant trained to rewrite medical summaries for an educated, non-medical adult (ages 17+, FKGL 12.0+). Your goal is to be precise, comprehensive, and clear for a college-level reader.\n",
"\n",
"Core Mandate:\n",
"- TARGET AUDIENCE: A curious college student or adult with no medical training.\n",
"- PRIMARY GOAL: Precision and structured clarity.\n",
"\n",
"Strict Rules You Must Follow:\n",
"- SENTENCE LENGTH: Use clear, well-constructed sentences. Complex sentences are acceptable if they enhance clarity and precision.\n",
"- VOCABULARY: Use correct medical terminology. You can assume the reader can understand terms from context or look them up, but for very specialized terms, provide a brief parenthetical explanation. For example: \"...showed evidence of hyperplasia (an increase in the number of cells).\"\n",
"- TONE: Maintain a professional, empathetic, and respectful tone. Be authoritative but not clinical or cold.\n",
"- STRUCTURE: Provide a detailed and structured summary. Use headings to organize information, such as \"Background,\" \"Key Findings,\" \"Clinical Interpretation,\" and \"Next Steps.\"\n",
"- FOCUS: Be comprehensive and faithful to the source summary. Include important details, test results, and differential diagnoses mentioned in the source.\n",
"\n",
"- Never use emojis.\n",
"- Do not explain pronunciation.\n",
"\"\"\"\n",
" },\n",
" \"es\": {\n",
" \"B1\": \"\"\"Eres un asistente de resumen. Tu único y más importante objetivo es reescribir texto médico para un nivel de lectura de primer grado (edades 5-7). La simplicidad es más importante que el detalle.\n",
"\n",
"Mandato Principal:\n",
"- PÚBLICO OBJETIVO: Un niño de 6 años.\n",
"- OBJETIVO PRIMARIO: Simplicidad extrema. Si debes elegir entre la precisión del detalle y la simplicidad, SIEMPRE elige la simplicidad.\n",
"\n",
"Reglas Estrictas que Debes Seguir:\n",
"- IDIOMA: El resumen DEBE estar escrito en español.\n",
"- LONGITUD DE LA ORACIÓN: Casi todas las oraciones deben tener menos de 10 palabras. Usa frases muy cortas y simples.\n",
"- VOCABULARIO: Usa solo palabras cotidianas y muy comunes que un niño de primer grado conocería. Evita cualquier término médico o científico. En lugar de 'fémur', di 'hueso del muslo'. En lugar de 'benigno', di 'que no es dañino'.\n",
"- TONO: Sé muy gentil, calmado y tranquilizador. Como un doctor amable explicándole algo a un niño pequeño.\n",
"- ESTRUCTURA: Usa párrafos cortos, a menudo de solo una o dos oraciones.\n",
"- ENFOQUE: Menciona solo el punto más importante o los dos puntos más importantes del texto original. Omite todos los demás detalles.\n",
"\n",
"- Nunca uses emojis.\n",
"- No expliques la pronunciación.\n",
"- NO uses jerga médica.\n",
"\"\"\",\n",
" \"B2\": \"\"\"Eres un asistente de resumen entrenado para reescribir resúmenes médicos para un nivel de lectura de secundaria (edades 11–14). Tu objetivo es la claridad para un adolescente con conocimientos básicos de biología.\n",
"\n",
"Mandato Principal:\n",
"- PÚBLICO OBJETIVO: Un estudiante de 14 años en una clase de biología de secundaria.\n",
"- OBJETIVO PRIMARIO: Claridad y explicación directa.\n",
"\n",
"Reglas Estrictas que Debes Seguir:\n",
"- IDIOMA: El resumen DEBE estar escrito en español.\n",
"- LONGITUD DE LA ORACIÓN: Varía la longitud de las oraciones, pero busca un promedio de 12-18 palabras. Evita las oraciones largas y complejas.\n",
"- VOCABULARIO: Puedes usar términos médicos básicos (ej., 'biopsia', 'células', 'tumor'), pero DEBES explicarlos en términos sencillos inmediatamente. Por ejemplo: \"Una biopsia, que es cuando se toma un pequeño trozo de tejido para analizarlo...\".\n",
"- TONO: Sé empático pero directo. Usa un tono educativo e informativo, como un profesor de ciencias.\n",
"- ESTRUCTURA: Organiza el resumen en párrafos lógicos. Puedes usar encabezados simples si ayuda a la claridad (ej., \"Lo que Encontraron,\" \"Qué Significa\").\n",
"- ENFOQUE: Resume los hallazgos principales y sus implicaciones. Omite detalles menores o muy técnicos.\n",
"\n",
"- Nunca uses emojis.\n",
"- No expliques la pronunciación.\n",
"\"\"\",\n",
" \"B3\": \"\"\"Eres un asistente de resumen entrenado para reescribir resúmenes médicos para un adulto educado no médico (edades 17+). Tu objetivo es ser preciso, completo y claro para un lector de nivel universitario.\n",
"\n",
"Mandato Principal:\n",
"- PÚBLICO OBJETIVO: Un estudiante universitario o un adulto curioso sin formación médica.\n",
"- OBJETIVO PRIMARIO: Precisión y claridad estructurada.\n",
"\n",
"Reglas Estrictas que Debes Seguir:\n",
"- IDIOMA: El resumen DEBE estar escrito en español.\n",
"- LONGITUD DE LA ORACIÓN: Usa oraciones claras y bien construidas. Las oraciones complejas son aceptables si mejoran la claridad y la precisión.\n",
"- VOCABULARIO: Usa la terminología médica correcta. Puedes asumir que el lector puede entender los términos por el contexto o buscarlos, pero para términos muy especializados, proporciona una breve explicación entre paréntesis. Por ejemplo: \"...mostró evidencia de hiperplasia (un aumento en el número de células).\"\n",
"- TONO: Mantén un tono profesional, empático y respetuoso. Sé autoritario pero no clínico o frío.\n",
"- ESTRUCTURA: Proporciona un resumen detallado y estructurado. Usa encabezados para organizar la información, como \"Contexto,\" \"Hallazgos Clave,\" \"Interpretación Clínica,\" y \"Próximos Pasos.\"\n",
"- ENFOQUE: Sé completo y fiel al resumen original. Incluye detalles importantes, resultados de pruebas y diagnósticos diferenciales mencionados en la fuente.\n",
"\n",
"- Nunca uses emojis.\n",
"- No expliques la pronunciación.\n",
"\"\"\"\n",
" },\n",
"\"fr\": {\n",
" \"B1\": \"\"\"Vous êtes un assistant de résumé. Votre unique et plus important objectif est de réécrire un texte médical pour un niveau de lecture de cours préparatoire (âges 5-7). La simplicité est plus importante que le détail.\n",
"\n",
"Mandat Principal :\n",
"- PUBLIC CIBLE : Un enfant de 6 ans.\n",
"- OBJECTIF PRINCIPAL : Simplicité extrême. Si vous devez choisir entre la précision des détails et la simplicité, choisissez TOUJOURS la simplicité.\n",
"\n",
"Règles Strictes à Suivre Impérativement :\n",
"- LANGUE : Le résumé DOIT être rédigé en français.\n",
"- LONGUEUR DES PHRASES : Presque toutes les phrases doivent faire moins de 10 mots. Utilisez des phrases très courtes et simples.\n",
"- VOCABULAIRE : Utilisez uniquement des mots très courants et quotidiens qu'un enfant de cet âge connaîtrait. Évitez tout terme médical ou scientifique. Au lieu de 'fémur', dites 'l'os de la cuisse'. Au lieu de 'bénin', dites 'pas dangereux'.\n",
"- TON : Soyez très doux, calme et rassurant. Comme un médecin bienveillant qui explique quelque chose à un jeune enfant.\n",
"- STRUCTURE : Utilisez des paragraphes courts, souvent composés d'une ou deux phrases seulement.\n",
"- PRIORITÉ : Mentionnez uniquement le ou les deux points les plus importants du texte original. Omettez tous les autres détails.\n",
"\n",
"- N'utilisez jamais d'emojis.\n",
"- N'expliquez pas la prononciation.\n",
"- N'utilisez AUCUN jargon médical.\n",
"\"\"\",\n",
" \"B2\": \"\"\"Vous êtes un assistant de résumé entraîné à réécrire des résumés médicaux pour un niveau de lecture de collège (âges 11–14). Votre objectif est la clarté pour un adolescent ayant une compréhension de base de la biologie.\n",
"\n",
"Mandat Principal :\n",
"- PUBLIC CIBLE : Un adolescent de 14 ans en classe de biologie au collège.\n",
"- OBJECTIF PRINCIPAL : Clarté et explication directe.\n",
"\n",
"Règles Strictes à Suivre Impérativement :\n",
"- LANGUE : Le résumé DOIT être rédigé en français.\n",
"- LONGUEUR DES PHRASES : Variez la longueur des phrases, mais visez une moyenne de 12-18 mots. Évitez les phrases longues et complexes.\n",
"- VOCABULAIRE : Vous pouvez utiliser des termes médicaux de base (ex: 'biopsie', 'cellules', 'tumeur'), mais vous DEVEZ les expliquer en termes simples immédiatement. Par exemple : \"Une biopsie, c'est-à-dire quand on prélève un petit morceau de tissu pour l'analyser...\".\n",
"- TON : Soyez empathique mais direct. Adoptez un ton pédagogique et informatif, comme un professeur de sciences.\n",
"- STRUCTURE : Organisez le résumé en paragraphes logiques. Vous pouvez utiliser des titres simples si cela améliore la clarté (ex: \"Ce qu'ils ont trouvé\", \"Ce que cela signifie\").\n",
"- PRIORITÉ : Résumez les principales observations et leurs implications. Omettez les détails mineurs ou très techniques.\n",
"\n",
"- N'utilisez jamais d'emojis.\n",
"- N'expliquez pas la prononciation.\n",
"\"\"\",\n",
" \"B3\": \"\"\"Vous êtes un assistant de résumé entraîné à réécrire des résumés médicaux pour un adulte éduqué non-médecin (âges 17+). Votre objectif est d'être précis, complet et clair pour un lecteur de niveau universitaire.\n",
"\n",
"Mandat Principal :\n",
"- PUBLIC CIBLE : Un étudiant ou un adulte curieux sans formation médicale.\n",
"- OBJECTIF PRINCIPAL : Précision et clarté structurée.\n",
"\n",
"Règles Strictes à Suivre Impérativement :\n",
"- LANGUE : Le résumé DOIT être rédigé en français.\n",
"- LONGUEUR DES PHRASES : Utilisez des phrases claires et bien construites. Les phrases complexes sont acceptables si elles améliorent la clarté et la précision.\n",
"- VOCABULAIRE : Utilisez la terminologie médicale correcte. Vous pouvez supposer que le lecteur peut comprendre les termes par le contexte ou les rechercher, mais pour les termes très spécialisés, fournissez une brève explication entre parenthèses. Par exemple : \"...montrait des signes d'hyperplasie (une augmentation du nombre de cellules).\"\n",
"- TON : Maintenez un ton professionnel, empathique et respectueux. Soyez directif mais ni clinique ni froid.\n",
"- STRUCTURE : Fournissez un résumé détaillé et structuré. Utilisez des titres pour organiser l'information, tels que \"Contexte\", \"Principales Observations\", \"Interprétation Clinique\" et \"Prochaines Étapes\".\n",
"- PRIORITÉ : Soyez complet et fidèle au résumé source. Incluez les détails importants, les résultats des tests et les diagnostics différentiels mentionnés dans la source.\n",
"\n",
"- N'utilisez jamais d'emojis.\n",
"- N'expliquez pas la prononciation.\n",
"\"\"\"\n",
"},\n",
"\n",
"\"pt\": {\n",
" \"B1\": \"\"\"Você é um assistente de resumo. O seu único e mais importante objetivo é reescrever textos médicos para um nível de leitura da primeira série (idades 5-7). A simplicidade é mais importante que os detalhes.\n",
"\n",
"Mandato Principal:\n",
"- PÚBLICO-ALVO: Uma criança de 6 anos.\n",
"- OBJETIVO PRINCIPAL: Simplicidade extrema. Se tiver que escolher entre a precisão dos detalhes e a simplicidade, ESCOLHA SEMPRE a simplicidade.\n",
"\n",
"Regras Rígidas que Você Deve Seguir:\n",
"- IDIOMA: O resumo DEVE ser escrito em português.\n",
"- COMPRIMENTO DAS FRASES: Quase todas as frases devem ter menos de 10 palavras. Use frases muito curtas e simples.\n",
"- VOCABULÁRIO: Use apenas palavras quotidianas e muito comuns que uma criança da primeira série conheceria. Evite qualquer termo médico ou científico. Em vez de 'fêmur', diga 'o osso da coxa'. Em vez de 'benigno', diga 'que não faz mal'.\n",
"- TOM: Seja muito gentil, calmo e tranquilizador. Como um médico amável a explicar algo a uma criança pequena.\n",
"- ESTRUTURA: Use parágrafos curtos, muitas vezes com apenas uma ou duas frases.\n",
"- FOCO: Mencione apenas um ou dois dos pontos mais importantes do texto original. Omita todos os outros detalhes.\n",
"\n",
"- Nunca use emojis.\n",
"- Não explique a pronúncia.\n",
"- NÃO use NENHUM jargão médico.\n",
"\"\"\",\n",
" \"B2\": \"\"\"Você é um assistente de resumo treinado para reescrever resumos médicos para um nível de leitura do ensino fundamental II (idades 11–14). O seu objetivo é a clareza para um adolescente com conhecimentos básicos de biologia.\n",
"\n",
"Mandato Principal:\n",
"- PÚBLICO-ALVO: Um adolescente de 14 anos numa aula de biologia.\n",
"- OBJETIVO PRINCIPAL: Clareza e explicação direta.\n",
"\n",
"Regras Rígidas que Você Deve Seguir:\n",
"- IDIOMA: O resumo DEVE ser escrito em português.\n",
"- COMPRIMENTO DAS FRASES: Varie o comprimento das frases, mas procure uma média de 12 a 18 palavras. Evite frases longas e complexas.\n",
"- VOCABULÁRIO: Pode usar termos médicos básicos (ex: 'biópsia', 'células', 'tumor'), mas você DEVE explicá-los em termos simples imediatamente. Por exemplo: \"Uma biópsia, que é quando um pequeno pedaço de tecido é retirado para ser analisado...\".\n",
"- TOM: Seja empático, mas direto. Use um tom educativo e informativo, como um professor de ciências.\n",
"- ESTRUTURA: Organize o resumo em parágrafos lógicos. Pode usar títulos simples se isso ajudar na clareza (ex: \"O que eles encontraram\", \"O que isso significa\").\n",
"- FOCO: Resuma os principais achados e as suas implicações. Omita detalhes menores ou muito técnicos.\n",
"\n",
"- Nunca use emojis.\n",
"- Não explique a pronúncia.\n",
"\"\"\",\n",
" \"B3\": \"\"\"Você é um assistente de resumo treinado para reescrever resumos médicos para um adulto instruído, mas sem formação médica (idades 17+). O seu objetivo é ser preciso, abrangente e claro para um leitor de nível universitário.\n",
"\n",
"Mandato Principal:\n",
"- PÚBLICO-ALVO: Um estudante universitário ou adulto curioso sem formação médica.\n",
"- OBJETIVO PRINCIPAL: Precisão e clareza estruturada.\n",
"\n",
"Regras Rígidas que Você Deve Seguir:\n",
"- IDIOMA: O resumo DEVE ser escrito em português.\n",
"- COMPRIMENTO DAS FRASES: Use frases claras e bem construídas. Frases complexas são aceitáveis se melhorarem a clareza e a precisão.\n",
"- VOCABULÁRIO: Use a terminologia médica correta. Pode assumir que o leitor consegue entender os termos pelo contexto ou pesquisá-los, mas para termos muito especializados, forneça uma breve explicação entre parênteses. Por exemplo: \"...mostrou evidência de hiperplasia (um aumento no número de células).\"\n",
"- TOM: Mantenha um tom profissional, empático e respeitoso. Seja confiante, mas não clínico ou frio.\n",
"- ESTRUTURA: Forneça um resumo detalhado e estruturado. Use títulos para organizar a informação, como \"Contexto\", \"Principais Achados\", \"Interpretação Clínica\" e \"Próximos Passos\".\n",
"- FOCO: Seja abrangente e fiel ao resumo original. Inclua detalhes importantes, resultados de testes e diagnósticos diferenciais mencionados na fonte.\n",
"\n",
"- Nunca use emojis.\n",
"- Não explique a pronúncia.\n",
"\"\"\"\n",
"}\n",
"\n",
"}\n",
"USER_PROMPT_TEMPLATES = {\n",
" \"en\": \"\"\"Please rewrite the following expert summary for the specified target audience. Use the full article for context if needed.\n",
"**Full Article Context:**\n",
"{article}\n",
"**Expert Summary to Rewrite:**\n",
"{gold_summary}\n",
"\"\"\",\n",
" \"es\": \"\"\"Por favor, reescribe el siguiente resumen de experto para el público objetivo especificado. Usa el artículo completo como contexto si es necesario.\n",
"**Contexto del Artículo Completo:**\n",
"{article}\n",
"**Resumen de Experto a Reescribir:**\n",
"{gold_summary}\n",
"\"\"\",\n",
" \"fr\": \"\"\"Veuillez réécrire le résumé d'expert suivant pour le public cible spécifié. Utilisez l'article complet comme contexte si nécessaire.\n",
"**Contexte de l'Article Complet :**\n",
"{article}\n",
"**Résumé d'Expert à Réécrire :**\n",
"{gold_summary}\n",
"\"\"\",\n",
" \"pt\": \"\"\"Por favor, reescreva o seguinte resumo de especialista para o público-alvo especificado. Use o artigo completo como contexto, se necessário.\n",
"**Contexto do Artigo Completo:**\n",
"{article}\n",
"**Resumo do Especialista a Ser Reescrito:**\n",
"{gold_summary}\n",
"\"\"\"\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2bb9ee67",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e40397cf",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"lang=\"es\"\n",
"with open('/home/mshahidul/readctrl/generating_data/tik_ache/es_syntheticV3.json', 'r', encoding='utf-8') as f:\n",
" data = json.load(f)\n",
"\n",
"converted = []\n",
"prompts_for_lang = ALL_PROMPTS.get(lang)\n",
"user_prompt_template = USER_PROMPT_TEMPLATES.get(lang)\n",
"for msg in data:\n",
"    # The user prompt depends only on the record (article + gold summary),\n",
"    # not on the readability band, so build it once per record.\n",
"    user_prompt = user_prompt_template.format(article=msg['article'], gold_summary=msg['gold_summary'])\n",
"    for key, sys_msg in msg['synthetic_summary'].items():\n",
"        system_prompt = prompts_for_lang[key]\n",
"        # Build a fresh dict for every band. Reusing a single dict across bands\n",
"        # makes every appended entry alias the same object, so only the last\n",
"        # band's conversation would survive in the output.\n",
"        conversation = {\n",
"            'conversations': (\n",
"                {'from': \"human\", 'content': system_prompt+'\\n'+user_prompt},\n",
"                {'from': \"gpt\", 'content': sys_msg},\n",
"            )\n",
"        }\n",
"        converted.append(conversation)\n",
"\n",
"# Save or print the result\n",
"with open(f'/home/mshahidul/readctrl/data_train/{lang}_train.json', 'w', encoding='utf-8') as f:\n",
" json.dump(converted, f, ensure_ascii=False, indent=2)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4373e6c",
"metadata": {},
"outputs": [],
"source": [
"with open('/home/mshahidul/readctrl/data_train/es_train.json', 'r', encoding='utf-8') as f:\n",
" es_data = json.load(f)\n",
"print(es_data[0]['conversations'][1]['content'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4e8e1d2d",
"metadata": {},
"outputs": [],
"source": [
"def generate_prompt(article, gold_summary, band, lang):\n",
" \"\"\"Build the combined prompt text (system prompt, newline, user prompt) for a given readability band and language; no model is called here.\"\"\"\n",
" prompts_for_lang = ALL_PROMPTS.get(lang)\n",
" user_prompt_template = USER_PROMPT_TEMPLATES.get(lang)\n",
" if not prompts_for_lang or not user_prompt_template:\n",
" raise ValueError(f\"No prompts available for language: {lang}\")\n",
" \n",
" system_prompt = prompts_for_lang[band]\n",
" user_prompt = user_prompt_template.format(article=article, gold_summary=gold_summary)\n",
" return system_prompt + \"\\n\" + user_prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddb14cb1",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"lang=\"es\"\n",
"with open('/home/mshahidul/readctrl/generating_data/tik_ache/es_syntheticV3.json', 'r', encoding='utf-8') as f:\n",
" data = json.load(f)\n",
"\n",
"converted = []\n",
"prompts_for_lang = ALL_PROMPTS.get(lang)\n",
"user_prompt_template = USER_PROMPT_TEMPLATES.get(lang)\n",
"for msg in data:\n",
" for key in msg['synthetic_summary'].keys():\n",
" conversation={}\n",
" system_prompt = prompts_for_lang[key]\n",
" sys_msg=msg['synthetic_summary'][key]\n",
" user_prompt = user_prompt_template.format(article=msg['article'], gold_summary=msg['gold_summary'])\n",
" conversation['conversations']= (\n",
" {'from': \"human\", 'content': system_prompt+'\\n'+user_prompt},\n",
" {'from': \"gpt\", 'content': sys_msg},\n",
" )\n",
" converted.append(conversation)\n",
"\n",
"# Save or print the result\n",
"with open(f'/home/mshahidul/readctrl/data_train/{lang}_train.json', 'w', encoding='utf-8') as f:\n",
" json.dump(converted, f, ensure_ascii=False, indent=2)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b82bd543",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"with open('/home/mshahidul/readctrl/synthetic_data_es_raw/0.json', 'r', encoding='utf-8') as f:\n",
" raw_es_data = json.load(f)\n",
"print(f\"easy:- {raw_es_data['readability_versions']['easy']['text']}\")\n",
"print(f\"intermediate:- {raw_es_data['readability_versions']['intermediate']['text']}\")\n",
"print(f\"hard:- {raw_es_data['readability_versions']['hard']['text']}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aca0ef62",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"\n",
"raw_dir = '/home/mshahidul/readctrl/synthetic_data_es_raw'\n",
"raw_files = [f for f in os.listdir(raw_dir) if f.endswith('.json')]\n",
"\n",
"raw_data_list = []\n",
"for fname in raw_files:\n",
" with open(os.path.join(raw_dir, fname), 'r', encoding='utf-8') as f:\n",
" raw_data_list.append(json.load(f))\n",
"\n",
"print(f\"Loaded {len(raw_data_list)} files from {raw_dir}\")\n",
"with open('/home/mshahidul/readctrl/data/hand_create_gpt5/es_rawV1.json', 'w', encoding='utf-8') as f:\n",
" json.dump(raw_data_list, f, ensure_ascii=False, indent=4)"
]
},
{
"cell_type": "markdown",
"id": "0c6d8fb6",
"metadata": {},
"source": [
"## Reformat the dataset for training"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0899cccb",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"prompts={\n",
"\"easy\":'''\n",
"You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.\n",
"Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).\n",
"Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.\n",
"Keep all important factual details, but remove jargon.\n",
"Return only the rewritten text without commentary.\n",
"''',\n",
"\n",
"'intermediate':'''\n",
"You are an assistant specialized in rewriting Spanish texts with medium readability.\n",
"Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).\n",
"Use clear and complete sentences, moderately complex vocabulary, and structured narration.\n",
"Retain all relevant medical or factual information, but phrase it in accessible language.\n",
"Return only the rewritten text with no explanations.\n",
"''',\n",
"\n",
"'hard':'''\n",
"You are an assistant that rewrites Spanish medical texts with professional, technical precision.\n",
"Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.\n",
"The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).\n",
"Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.\n",
"Return only the rewritten text.\n",
"'''\n",
"}\n",
"with open('/home/mshahidul/readctrl/data/hand_create_gpt5/es_rawV1.json', 'r', encoding='utf-8') as f:\n",
" gpt5_syn_es = json.load(f)\n",
"gpt5_syn_es[0]\n",
"import json\n",
"\n",
"with open('/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_es.json', 'r', encoding='utf-8') as f:\n",
" test_data = json.load(f)\n",
"\n",
"def full_text(id):\n",
" for item in test_data:\n",
" if item['id'] == id:\n",
" return item['fulltext']\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38186215",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9ce8569",
"metadata": {},
"outputs": [],
"source": [
"converted = []\n",
"cnt=0\n",
"for item in gpt5_syn_es:\n",
" readability_data=item['readability_versions']\n",
" fulltext=full_text(item['id'])\n",
" for band, band_data in readability_data.items():\n",
" conversation={}\n",
" system_prompt=prompts[band]\n",
" conversation['conversations']= (\n",
" {'from': \"human\", 'content': system_prompt+'\\n\\n'+\"Input text:\\n\"+fulltext},\n",
" {'from': \"gpt\", 'content': band_data['text']},\n",
" )\n",
" converted.append(conversation)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52be9a01",
"metadata": {},
"outputs": [],
"source": [
"# [{'content': 'reasoning language: French\\n\\nYou are an AI chatbot with a lively and energetic personality.',\n",
"# 'role': 'system',\n",
"# 'thinking': None},\n",
"# {'content': 'Can you show me the latest trends on Twitter right now?',\n",
"# 'role': 'user',\n",
"# 'thinking': None},\n",
"# {'content': 'Hey there! While I can\\'t check Twitter (X) in real-time or access live data, I can share some tips to help you spot the latest trends:\\n\\n1. **Open the \"Trending\" tab** on the Twitter app or website – it updates constantly! \\n2. **Search for hashtags** like #Trending or #Viral to see what’s blowing up. \\n3. **Follow accounts** that curate trends (e.g., @TrendingNow, @ViralThreads). \\n4. **Check regional trends** – they often differ by location! \\n\\nRemember, trends are *super fast-moving* and often tied to pop culture, memes, or breaking news. For example, recent trends have included viral challenges (like the \"Distracted Boyfriend\" meme revival), celebrity drama, or unexpected events (hello, weather disasters!). \\n\\nWant me to brainstorm *what* might trend next? I’ve got ideas!',\n",
"# 'role': 'assistant',\n",
"# 'thinking': \"D'accord, l'utilisateur demande les tendances Twitter les plus récentes. Tout d'abord, je dois vérifier si j'ai accès à des données en temps réel. Étant donné que je ne peux pas naviguer sur Internet ou accéder directement à l'API de Twitter, je ne peux pas fournir des tendances en direct. Cependant, je peux donner quelques conseils généraux sur la façon de les trouver.\\n\\nJe devrais préciser que les tendances Twitter évoluent rapidement et sont spécifiques à chaque région. Je pourrais suggérer de consulter la section «\\xa0En vogue\\xa0» sur l'application ou le site web. Aussi, l'utilisation de hashtags et le suivi d'utilisateurs pertinents pourraient être utiles. Il est important de souligner que les tendances varient selon la région et l'heure de la journée. Je devrais garder un ton amical et bienveillant, peut-être ajouter un emoji pour rester léger. Je vais structurer ma réponse étape par étape pour faciliter la lecture. Je dois m'excuser de ne pas pouvoir fournir des données en temps réel et proposer d'autres méthodes. Je conserverai un langage simple et convivial, en évitant les termes techniques.\"}]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a71fdf6",
"metadata": {},
"outputs": [],
"source": [
"# Build one chat-style training example per (item, readability band).\n",
"converted = []\n",
"cnt = 0\n",
"for item in gpt5_syn_es:\n",
"    readability_data = item['readability_versions']\n",
"    # full_text() is looked up once per item and reused for every band.\n",
"    fulltext = full_text(item['id'])\n",
"    for band, band_data in readability_data.items():\n",
"        system_prompt = prompts[band]\n",
"        # Turn order: band-specific system prompt, input text, band rewrite.\n",
"        turns = (\n",
"            ('system', system_prompt),\n",
"            ('user', \"Input text:\\n\" + fulltext),\n",
"            ('assistant', band_data['text']),\n",
"        )\n",
"        conversation = {\n",
"            'messages': tuple(\n",
"                {'role': role, 'content': content, 'thinking': None}\n",
"                for role, content in turns\n",
"            )\n",
"        }\n",
"        converted.append(conversation)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f173809",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "c20a9f4a",
"metadata": {},
"outputs": [],
"source": [
"# Persist the converted conversations as pretty-printed UTF-8 JSON.\n",
"out_path = '/home/mshahidul/readctrl/data/hand_create_gpt5/es_trainV1.json'\n",
"with open(out_path, 'w', encoding='utf-8') as f:\n",
"    json.dump(converted, f, ensure_ascii=False, indent=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "975d8e1b",
"metadata": {},
"outputs": [],
"source": [
"import pyphen\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Spanish pyphen dictionary; its hyphenation points stand in for syllable breaks.\n",
"dic = pyphen.Pyphen(lang='es')\n",
"\n",
"# --- FH Score Functions ---\n",
"def count_syllables(word):\n",
"    \"\"\"Estimate syllables in `word` as the number of pyphen hyphenation points plus one.\"\"\"\n",
"    return dic.inserted(word).count('-') + 1\n",
"\n",
"def huerta_score(text):\n",
"    \"\"\"\n",
"    Compute the Fernández Huerta readability score for Spanish text.\n",
"    FH = 206.84 - 60 * (Syllables per Word) - 1.02 * (Words per Sentence)\n",
"\n",
"    Words are maximal runs of letters, so a token with attached punctuation\n",
"    ('leer.', 'casa,') still counts; digits and symbols are ignored.\n",
"    Sentences end at '.', '!', '?' or '…' followed by whitespace or the end\n",
"    of the text, so a decimal such as '3.5' does not start a new sentence.\n",
"\n",
"    Returns the score rounded to two decimals, or 0.0 when the text\n",
"    contains no words.\n",
"    \"\"\"\n",
"    import re  # local import so the function does not rely on cell-level imports\n",
"\n",
"    word_re = r'[^\\W\\d_]+'\n",
"    words = re.findall(word_re, text)\n",
"    # Only chunks that contain at least one word count as sentences.\n",
"    sentences = [s for s in re.split(r'[.!?…]+(?:\\s+|$)', text) if re.search(word_re, s)]\n",
"    if not words or not sentences:\n",
"        return 0.0\n",
"    total_syllables = sum(count_syllables(word.lower()) for word in words)\n",
"    avg_syllables_per_word = total_syllables / len(words)\n",
"    avg_sentence_length = len(words) / len(sentences)\n",
"    score = 206.84 - 60 * avg_syllables_per_word - 1.02 * avg_sentence_length\n",
"    return round(score, 2)\n",
"\n",
"# --- Plotting Function ---\n",
"def plot_fh_scores(text_list, hard_text=None):\n",
"    \"\"\"\n",
"    Plot the Fernández Huerta score of each text in `text_list` as black dots\n",
"    over shaded easy / intermediate / hard bands, then print every score.\n",
"\n",
"    Args:\n",
"        text_list: texts to score with huerta_score.\n",
"        hard_text: optional extra text whose score is printed as the 'hard'\n",
"            band example. It replaces the former read of a global `band_data`,\n",
"            which raised NameError unless another cell had defined it.\n",
"    \"\"\"\n",
"    scores = [huerta_score(t) for t in text_list]\n",
"    indices = list(range(len(text_list)))\n",
"\n",
"    plt.figure(figsize=(10, 5))\n",
"    plt.plot(indices, scores, 'ko', label='FH Score')\n",
"    plt.axhspan(70, 100, color='green', alpha=0.1, label='Easy (70-100)')\n",
"    plt.axhspan(50, 70, color='blue', alpha=0.1, label='Intermediate (50-70)')\n",
"    plt.axhspan(0, 50, color='red', alpha=0.1, label='Hard (0-50)')\n",
"    plt.xlabel('Text Index')\n",
"    plt.ylabel('Fernández Huerta Score')\n",
"    plt.title('Fernández Huerta Readability Scores')\n",
"    plt.legend()\n",
"    plt.tight_layout()\n",
"    plt.show()\n",
"\n",
"    # Also print results\n",
"    for i, s in enumerate(scores):\n",
"        print(f\"Text {i}: FH Score = {s}\")\n",
"\n",
"    # Optional example: score one explicitly supplied 'hard' text.\n",
"    if hard_text is not None:\n",
"        hard_score = huerta_score(hard_text)\n",
"        print(f'Fernández Huerta score for \"hard\" band: {hard_score}')\n",
"\n",
"# --- Example Usage ---\n",
"# texts = [\n",
"# \"Este es un texto muy simple y fácil de leer. Las oraciones son cortas.\",\n",
"# \"El presente documento aborda temas complejos relacionados con la neurociencia cognitiva y su aplicación en sistemas computacionales.\",\n",
"# \"El perro corre rápido. Juega con la pelota. Se divierte mucho.\"\n",
"# ]\n",
"\n",
"# plot_fh_scores(texts)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "804a3d10",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Load the JSON at test_en_path, then show its size and first entry.\n",
"test_en_path = '/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_en.json'\n",
"with open(test_en_path, 'r', encoding='utf-8') as f:\n",
"    test_en_data = json.loads(f.read())\n",
"\n",
"n_items = len(test_en_data)\n",
"print(f\"Loaded {n_items} items from {test_en_path}\")\n",
"print(test_en_data[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a230d18",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "e372abbf",
"metadata": {},
"source": [
"## Model accuracy check"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1190eb4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------\n",
"temp0.1_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 49, 'intermediate': 14, 'hard': 9}\n",
"easy: 98.00%, intermediate: 28.00%, hard: 18.00%\n",
"--------------------------------------------------\n",
"temp0.3_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 48, 'intermediate': 15, 'hard': 10}\n",
"easy: 96.00%, intermediate: 30.00%, hard: 20.00%\n",
"--------------------------------------------------\n",
"temp0.5_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 37, 'intermediate': 32, 'hard': 17}\n",
"easy: 74.00%, intermediate: 64.00%, hard: 34.00%\n",
"--------------------------------------------------\n",
"temp1.3_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 46, 'intermediate': 25, 'hard': 24}\n",
"easy: 92.00%, intermediate: 50.00%, hard: 48.00%\n",
"--------------------------------------------------\n",
"temp1.1_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 40, 'intermediate': 30, 'hard': 29}\n",
"easy: 80.00%, intermediate: 60.00%, hard: 58.00%\n",
"--------------------------------------------------\n",
"temp1.0_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 43, 'intermediate': 32, 'hard': 18}\n",
"easy: 86.00%, intermediate: 64.00%, hard: 36.00%\n",
"--------------------------------------------------\n",
"temp1.5_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 24, 'intermediate': 26, 'hard': 33}\n",
"easy: 48.00%, intermediate: 52.00%, hard: 66.00%\n",
"--------------------------------------------------\n",
"temp1.3_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 29, 'intermediate': 38, 'hard': 29}\n",
"easy: 58.00%, intermediate: 76.00%, hard: 58.00%\n",
"--------------------------------------------------\n",
"temp0.7_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 48, 'intermediate': 16, 'hard': 10}\n",
"easy: 96.00%, intermediate: 32.00%, hard: 20.00%\n",
"--------------------------------------------------\n",
"temp0.5_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 48, 'intermediate': 20, 'hard': 9}\n",
"easy: 96.00%, intermediate: 40.00%, hard: 18.00%\n",
"--------------------------------------------------\n",
"temp0.7_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 43, 'intermediate': 23, 'hard': 11}\n",
"easy: 86.00%, intermediate: 46.00%, hard: 22.00%\n",
"--------------------------------------------------\n",
"temp1.4_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 48, 'intermediate': 27, 'hard': 26}\n",
"easy: 96.00%, intermediate: 54.00%, hard: 52.00%\n",
"--------------------------------------------------\n",
"temp1.1_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 48, 'intermediate': 16, 'hard': 13}\n",
"easy: 96.00%, intermediate: 32.00%, hard: 26.00%\n",
"--------------------------------------------------\n",
"temp1.4_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 28, 'intermediate': 27, 'hard': 30}\n",
"easy: 56.00%, intermediate: 54.00%, hard: 60.00%\n",
"--------------------------------------------------\n",
"temp0.1_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 40, 'intermediate': 32, 'hard': 16}\n",
"easy: 80.00%, intermediate: 64.00%, hard: 32.00%\n",
"--------------------------------------------------\n",
"temp1.2_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 48, 'intermediate': 20, 'hard': 28}\n",
"easy: 96.00%, intermediate: 40.00%, hard: 56.00%\n",
"--------------------------------------------------\n",
"temp0.3_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 40, 'intermediate': 32, 'hard': 9}\n",
"easy: 80.00%, intermediate: 64.00%, hard: 18.00%\n",
"--------------------------------------------------\n",
"temp1.5_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 47, 'intermediate': 20, 'hard': 33}\n",
"easy: 94.00%, intermediate: 40.00%, hard: 66.00%\n",
"--------------------------------------------------\n",
"temp1.0_qwen3-14B_base_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 47, 'intermediate': 18, 'hard': 16}\n",
"easy: 94.00%, intermediate: 36.00%, hard: 32.00%\n",
"--------------------------------------------------\n",
"temp1.2_qwen3-14B_finetuned_with_defs.json accuracy results:\n",
"{'easy': 50, 'intermediate': 50, 'hard': 50}\n",
"{'easy': 39, 'intermediate': 36, 'hard': 27}\n",
"easy: 78.00%, intermediate: 72.00%, hard: 54.00%\n"
]
}
],
"source": [
"import os\n",
"import pyphen\n",
"import matplotlib.pyplot as plt\n",
"# (min, max) Fernández Huerta score per readability band; higher scores mean easier text.\n",
"band_ranges = dict(\n",
"    easy=(70, 100),\n",
"    intermediate=(50, 70),\n",
"    hard=(0, 50),\n",
")\n",
"# Spanish pyphen dictionary; its hyphenation points stand in for syllable breaks.\n",
"dic = pyphen.Pyphen(lang='es')\n",
"\n",
"# --- FH Score Functions ---\n",
"def count_syllables(word):\n",
"    \"\"\"Estimate syllables in `word` as the number of pyphen hyphenation points plus one.\"\"\"\n",
"    return dic.inserted(word).count('-') + 1\n",
"\n",
"def huerta_score(text):\n",
"    \"\"\"\n",
"    Compute the Fernández Huerta readability score for Spanish text.\n",
"    FH = 206.84 - 60 * (Syllables per Word) - 1.02 * (Words per Sentence)\n",
"\n",
"    Words are maximal runs of letters, so a token with attached punctuation\n",
"    ('leer.', 'casa,') still counts; digits and symbols are ignored.\n",
"    Sentences end at '.', '!', '?' or '…' followed by whitespace or the end\n",
"    of the text, so a decimal such as '3.5' does not start a new sentence.\n",
"\n",
"    Returns the score rounded to two decimals, or 0.0 when the text\n",
"    contains no words.\n",
"    \"\"\"\n",
"    import re  # local import so the function does not rely on cell-level imports\n",
"\n",
"    word_re = r'[^\\W\\d_]+'\n",
"    words = re.findall(word_re, text)\n",
"    # Only chunks that contain at least one word count as sentences.\n",
"    sentences = [s for s in re.split(r'[.!?…]+(?:\\s+|$)', text) if re.search(word_re, s)]\n",
"    if not words or not sentences:\n",
"        return 0.0\n",
"    total_syllables = sum(count_syllables(word.lower()) for word in words)\n",
"    avg_syllables_per_word = total_syllables / len(words)\n",
"    avg_sentence_length = len(words) / len(sentences)\n",
"    score = 206.84 - 60 * avg_syllables_per_word - 1.02 * avg_sentence_length\n",
"    return round(score, 2)\n",
"\n",
"def accuracy_check(path):\n",
"    \"\"\"\n",
"    Print, for the results file at `path`, how many generated summaries per\n",
"    readability band have a Fernández Huerta score inside that band's range.\n",
"\n",
"    Each item needs a 'synthetic_summary' string and a 'band' that is a key\n",
"    of band_ranges. Text before a '</think>' marker is discarded; a summary\n",
"    without the marker is scored as-is instead of raising IndexError. A band\n",
"    with no items is reported as 'n/a' instead of dividing by zero.\n",
"    \"\"\"\n",
"    import json\n",
"\n",
"    bands = ('easy', 'intermediate', 'hard')\n",
"    accuracy_data = dict.fromkeys(bands, 0)  # in-band summaries per band\n",
"    num_each_band = dict.fromkeys(bands, 0)  # all summaries per band\n",
"    with open(path, 'r', encoding='utf-8') as f:\n",
"        results_es = json.load(f)\n",
"\n",
"    for item in results_es:\n",
"        parts = item['synthetic_summary'].split(\"</think>\")\n",
"        # parts[1] is the text right after the first marker, as before.\n",
"        dat = (parts[1] if len(parts) > 1 else parts[0]).strip()\n",
"        band_data = item['band']\n",
"        band_min, band_max = band_ranges[band_data]\n",
"        if band_min <= huerta_score(dat) <= band_max:\n",
"            accuracy_data[band_data] += 1\n",
"        num_each_band[band_data] += 1\n",
"\n",
"    def pct(band):\n",
"        total = num_each_band[band]\n",
"        return f\"{(accuracy_data[band] / total) * 100:.2f}%\" if total else \"n/a\"\n",
"\n",
"    print(\"-\" * 50)\n",
"    print(f\"{os.path.basename(path)} accuracy results:\")\n",
"    print(num_each_band)\n",
"    print(accuracy_data)\n",
"    print(\", \".join(f\"{band}: {pct(band)}\" for band in bands))\n",
"\n",
"# Evaluate every JSON results file. os.listdir returns names in arbitrary\n",
"# order, so sort them to make the printed report reproducible.\n",
"results_dir = \"/home/mshahidul/readctrl/results/custom_promptsV1\"\n",
"for ind in sorted(os.listdir(results_dir)):\n",
"    if ind.endswith('.json'):\n",
"        accuracy_check(os.path.join(results_dir, ind))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6534a993",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Reload the saved training conversations and print how many there are.\n",
"train_path = '/home/mshahidul/readctrl/data/hand_create_gpt5/es_trainV1.json'\n",
"with open(train_path, 'r', encoding='utf-8') as f:\n",
"    data = json.loads(f.read())\n",
"\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed98df6a",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pyphen\n",
"import matplotlib.pyplot as plt\n",
"from collections import defaultdict\n",
"\n",
"# === CONFIG ===\n",
"root = \"/home/mshahidul/readctrl/data/hand_create_gpt5\"\n",
"input_json = root + \"/es_rawV1.json\"\n",
"output_json = root + \"/filtered_es_rawV1.json\"\n",
"\n",
"# (min, max) Fernández Huerta score per readability band.\n",
"band_ranges = dict(\n",
"    easy=(70, 100),\n",
"    intermediate=(50, 70),\n",
"    hard=(0, 50),\n",
")\n",
"\n",
"# Width of the exclusion zone next to each shared band boundary. With 5,\n",
"# is_in_band keeps easy 75-100, intermediate 55-65 and hard 0-45.\n",
"margin = 5\n",
"\n",
"# === FH Score Calculation ===\n",
"# Spanish pyphen dictionary; its hyphenation points stand in for syllable breaks.\n",
"dic = pyphen.Pyphen(lang='es')\n",
"\n",
"def count_syllables(word):\n",
"    \"\"\"Estimate syllables in `word` as the number of pyphen hyphenation points plus one.\"\"\"\n",
"    return dic.inserted(word).count('-') + 1\n",
"\n",
"def huerta_score(text):\n",
"    \"\"\"\n",
"    Compute the Fernández Huerta readability score for Spanish text.\n",
"    FH = 206.84 - 60 * (Syllables per Word) - 1.02 * (Words per Sentence)\n",
"\n",
"    Words are maximal runs of letters, so a token with attached punctuation\n",
"    ('leer.', 'casa,') still counts; digits and symbols are ignored.\n",
"    Sentences end at '.', '!', '?' or '…' followed by whitespace or the end\n",
"    of the text, so a decimal such as '3.5' does not start a new sentence.\n",
"\n",
"    Returns the score rounded to two decimals, or 0.0 when the text\n",
"    contains no words.\n",
"    \"\"\"\n",
"    import re  # local import so the function does not rely on cell-level imports\n",
"\n",
"    word_re = r'[^\\W\\d_]+'\n",
"    words = re.findall(word_re, text)\n",
"    # Only chunks that contain at least one word count as sentences.\n",
"    sentences = [s for s in re.split(r'[.!?…]+(?:\\s+|$)', text) if re.search(word_re, s)]\n",
"    if not words or not sentences:\n",
"        return 0.0\n",
"    total_syllables = sum(count_syllables(word.lower()) for word in words)\n",
"    avg_syllables_per_word = total_syllables / len(words)\n",
"    avg_sentence_length = len(words) / len(sentences)\n",
"    score = 206.84 - 60 * avg_syllables_per_word - 1.02 * avg_sentence_length\n",
"    return round(score, 2)\n",
"\n",
"# === Band validation ===\n",
"def is_in_band(score, band_name):\n",
"    \"\"\"\n",
"    Return True when `score` lies in the band's range and at least `margin`\n",
"    points away from any boundary shared with a neighbouring band.\n",
"\n",
"    Only shared boundaries are tightened: the lower edge for 'easy', both\n",
"    edges for 'intermediate', the upper edge for 'hard'.\n",
"    \"\"\"\n",
"    low, high = band_ranges[band_name]\n",
"    tighten_low = band_name in (\"easy\", \"intermediate\")\n",
"    tighten_high = band_name in (\"intermediate\", \"hard\")\n",
"    if tighten_low and score < low + margin:\n",
"        return False\n",
"    if tighten_high and score > high - margin:\n",
"        return False\n",
"    return low <= score <= high\n",
"\n",
"# === Process Dataset ===\n",
"with open(input_json, \"r\", encoding=\"utf-8\") as f:\n",
"    data = json.loads(f.read())\n",
"\n",
"levels = (\"easy\", \"intermediate\", \"hard\")\n",
"filtered_data = []\n",
"scores_summary = defaultdict(list)  # level -> every FH score seen\n",
"removed_count = defaultdict(int)    # level -> number of out-of-band versions\n",
"\n",
"for item in data:\n",
"    versions = item[\"readability_versions\"]\n",
"    invalid_bands = set()\n",
"\n",
"    for level in levels:\n",
"        version = versions[level]\n",
"        score = huerta_score(version[\"text\"])\n",
"        # Stored on the item itself, so kept items are saved with their score.\n",
"        version[\"FH_score\"] = score\n",
"        scores_summary[level].append(score)\n",
"\n",
"        if not is_in_band(score, level):\n",
"            invalid_bands.add(level)\n",
"            removed_count[level] += 1\n",
"\n",
"    # An item is kept only when all three versions fall inside their band.\n",
"    keep_item = not invalid_bands\n",
"    if keep_item:\n",
"        filtered_data.append(item)\n",
"\n",
"# === Save filtered dataset ===\n",
"with open(output_json, \"w\", encoding=\"utf-8\") as f:\n",
"    f.write(json.dumps(filtered_data, ensure_ascii=False, indent=2))\n",
"\n",
"# === Print stats ===\n",
"n_total, n_kept = len(data), len(filtered_data)\n",
"print(f\"✅ Original dataset size: {n_total}\")\n",
"print(f\"✅ Filtered dataset size: {n_kept}\")\n",
"print(f\"🗑️ Removed total: {n_total - n_kept}\")\n",
"print(\"\\n📊 Removal per readability band:\")\n",
"for level in levels:\n",
"    print(f\" {level.capitalize():<15}: {removed_count[level]} removed\")\n",
"\n",
"# === Plot distribution ===\n",
"plt.figure(figsize=(10, 6))\n",
"level_colors = {\"easy\": 'green', \"intermediate\": 'blue', \"hard\": 'red'}\n",
"for level, color in level_colors.items():\n",
"    level_scores = scores_summary[level]\n",
"    plt.scatter([level] * len(level_scores), level_scores,\n",
"                color=color, label=level, alpha=0.6)\n",
"# Shade each band's nominal score range behind the points.\n",
"shaded_bands = [\n",
"    (70, 100, 'green', 'Easy Band'),\n",
"    (50, 70, 'blue', 'Intermediate Band'),\n",
"    (0, 50, 'red', 'Hard Band'),\n",
"]\n",
"for y_low, y_high, band_color, band_label in shaded_bands:\n",
"    plt.axhspan(y_low, y_high, color=band_color, alpha=0.1, label=band_label)\n",
"plt.ylabel(\"Fernández Huerta Score\")\n",
"plt.title(\"Fernández Huerta Scores per Readability Level\")\n",
"plt.legend()\n",
"plt.grid(alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"id": "03b3905c",
"metadata": {},
"source": [
"## Command generator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0f8250c5",
"metadata": {},
"outputs": [],
"source": [
"def distribute_commands(all_ref, free_gpu):\n",
"    \"\"\"\n",
"    Split the command strings in `all_ref` into one contiguous chunk per GPU\n",
"    and pin each chunk to its GPU by replacing the '--cuda -1' placeholder.\n",
"\n",
"    Chunk sizes differ by at most one: the first len(all_ref) % len(free_gpu)\n",
"    GPUs get one extra command (previously the whole remainder was piled\n",
"    onto the last GPU).\n",
"\n",
"    Args:\n",
"        all_ref: list of command strings containing '--cuda -1'.\n",
"        free_gpu: list of GPU ids; must not be empty.\n",
"\n",
"    Returns:\n",
"        A list with one list of commands per entry of `free_gpu`, in order.\n",
"\n",
"    Raises:\n",
"        ValueError: if `free_gpu` is empty.\n",
"    \"\"\"\n",
"    num_gpus = len(free_gpu)\n",
"    if num_gpus == 0:\n",
"        raise ValueError(\"free_gpu must contain at least one GPU id\")\n",
"    base_allocate, extra = divmod(len(all_ref), num_gpus)\n",
"    new_li = []\n",
"    start = 0\n",
"    for g, gpu_id in enumerate(free_gpu):\n",
"        # The first `extra` GPUs absorb the remainder, one command each.\n",
"        stop = start + base_allocate + (1 if g < extra else 0)\n",
"        new_li.append([d.replace(\"--cuda -1\", f\"--cuda {gpu_id}\") for d in all_ref[start:stop]])\n",
"        start = stop\n",
"    return new_li"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6748b6ec",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# parser.add_argument(\"--cuda\", type=str, default=\"3\", help=\"CUDA device id, e.g., '0' or '0,1' for multiple GPUs\")\n",
"# parser.add_argument(\"--model_name\", type=str, default=\"/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2\", help=\"Path to the finetuned model\")\n",
"# parser.add_argument(\"--temperature\", type=float, default=0.1, help=\"Generation temperature\")\n",
"all_cmds = []\n",
"# '/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2'\n",
"model_names = [\n",
"    '/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2',\n",
"    'unsloth/Qwen3-14B',\n",
"]\n",
"# temp_list=[0.1, 0.3, 0.5, 0.7, 1.0, 1.1]\n",
"temp_list = [1.2, 1.3, 1.4, 1.5]\n",
"script = \"/home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py\"\n",
"# One command per (model, temperature) pair.\n",
"for model_name in model_names:\n",
"    for temp in temp_list:\n",
"        # \"--cuda -1\" is the placeholder that distribute_commands replaces with a GPU id.\n",
"        cmd = f\"python {script} --model_name {model_name} --temperature {temp} --cuda -1\"\n",
"        # cmd = f\"python /home/mshahidul/readctrl/code/finetune-inference/inferenceV3.py --model_name {model_name} --temperature {temp} --cuda -1\"\n",
"        # cmd = f\"CUDA_VISIBLE_DEVICES=-1 python /home/mshahidul/readctrl/code/finetune-inference/inferenceV3_temp.py --model_name {model_name} --temperature {temp}\"\n",
"        all_cmds.append(cmd)\n",
"len(all_cmds)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "673595ec",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name /home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2 --temperature 1.2 --cuda 2\n",
"python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name /home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2 --temperature 1.3 --cuda 2\n",
"python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name /home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2 --temperature 1.4 --cuda 2\n",
"python /home/mshahidul/readctrl/code/finetune-inference/inferenceV2_without_context.py --model_name /home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2 --temperature 1.5 --cuda 2\n"
]
}
],
"source": [
"# gamma 2: 2, beta 3: 3\n",
"free_gpu = [2, 3]\n",
"distributed_cmds = distribute_commands(all_cmds, free_gpu)\n",
"# Print only the chunk assigned to the first GPU in free_gpu.\n",
"first_gpu_cmds = distributed_cmds[0]\n",
"for sets in first_gpu_cmds:\n",
"    print(sets)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f184d424",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "unsloth",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|