import io
import os
import re
import math
import random
import asyncio
import textwrap
import pandas as pd

from docx import Document
from loguru import logger

from .r2_utils import (
    upload_text_to_minio,
    upload_dataframe_to_minio,
    upload_document_to_minio,
    get_file_from_minio
)
from .common_utils import escape_csv_field


BUCKET_NAME = "ai-scientist"


# Function to check relevance and obtain keywords as reason
async def is_relevant(title, abstract, topic, direction, chat_func):
    """ 
    Check if a paper is relevant to a topic and obtain keywords as reason.
    
    Args:
        title (str): Title of the paper.
        abstract (str): Abstract of the paper.
        topic (str): Topic to check relevance against.
        direction (str): Direction to check relevance against.
        chat_func (function): Function to call the chat model.

    Returns:
        bool: True if the paper is relevant, False otherwise.
        str: Keywords that indicate relevance.
    
    """
    relevance_prompt = (
        f"You are an academic expert specializing in the field of {topic}. Your task is to determine if the following paper is relevant to the research direction described as '{direction}'.\n\n"
        "Please follow this reasoning process:\n"
        "1. Carefully read the paper's title and abstract.\n"
        "2. Identify the core research area, methodology, results, or focal points presented in the paper.\n"
        "3. Compare these core elements to the given research direction. Consider whether the paper directly addresses, contributes to, or is closely aligned with the stated direction.\n"
        "4. If the paper aligns conceptually, methodologically, or thematically with the direction, then it is relevant. If it is only tangential or unrelated, it is not relevant.\n"
        "5. From the text, select the main keywords that strongly indicate relevance (if relevant). These keywords should be key concepts, terms, or phrases that link the paper’s content to the given research direction.\n"
        "6. If not relevant, you can provide no keywords or give a brief note indicating no strong linkage.\n\n"
        "You must provide the answer in the following exact format:\n"
        "Relevance: True or False\n"
        "Keywords: [Comma-separated keywords]\n\n"
        f"Title: {title}\n"
        f"Abstract: {abstract}\n"
    )
    response = await chat_func(relevance_prompt)
    if response is None:
        return False, "Relevance check unavailable due to server error."

    try:
        response_text = response.choices[0].message.content
        # The prompt mandates the exact line "Relevance: True or False";
        # matching that line avoids false positives from "True" appearing elsewhere
        relevance = re.search(r"Relevance:\s*True", response_text, re.IGNORECASE) is not None
        keywords = response_text.split(
            "Keywords:")[-1].strip() if "Keywords:" in response_text else ""
        return relevance, keywords
    except AttributeError:
        logger.error(f"Error in chat_func response format: {response}")
        return False, "Relevance check failed"
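
# A minimal usage sketch of the chat_func contract assumed throughout this
# module (the `client` object and model name below are illustrative
# assumptions, not part of this codebase): chat_func takes a prompt string
# and returns an OpenAI-style completion object, or None on server error.
#
#     async def chat_func(prompt):
#         try:
#             return await client.chat.completions.create(
#                 model="gpt-4o",
#                 messages=[{"role": "user", "content": prompt}],
#             )
#         except Exception:
#             return None
#
#     relevant, keywords = await is_relevant(
#         title, abstract, topic="catalysis",
#         direction="single-atom catalysts", chat_func=chat_func,
#     )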


# Modified summarize_abstract function with error handling for failed completion requests
async def summarize_abstract(title, abstract, first_author, chat_func):
    """
    Summarize the abstract of a research paper.
    
    Args:
        title (str): Title of the paper.
        abstract (str): Abstract of the paper.
        first_author (str): Name of the first author.
        chat_func (function): Function to call the chat model.

    Returns:
        str: Summary of the abstract.
    
    """
    formatted_author = reformat_author_name(first_author)
    
    # The decision prompt keeps the original logic: classify the abstract to choose the summary type
    decision_prompt = (
        f"Your task is to decide the type of summary needed based on the abstract.\n\n"
        f"Instructions:\n"
        f"- If the study primarily introduces, describes, or refines a method, technique, model, or computational approach, "
        f"with its main contribution being methodological rather than a discovery about a phenomenon, then output:\n"
        f"Output: full\n\n"
        f"- If the study primarily reports a new discovery, finding, result, or empirical outcome about a certain phenomenon, "
        f"biological entity, material property, or theoretical insight, then output:\n"
        f"Output: concise\n\n"
        f"Make your decision strictly based on the abstract content. Do not provide explanations or reasoning, "
        f"only the exact output word as instructed.\n\n"
        f"Title: {title}\nAbstract: {abstract}\n"
    )
    
    # Full summary: exactly two sentences covering the key findings, attributed to the first author ("<Author> et al.")
    full_summary_prompt = (
        "In exactly two sentences, provide a high-level summary of the study’s key findings, "
        "while maintaining concrete technical terms, methodologies, and specific entities. "
        # "Do not use 'this study', 'the authors', or similar phrases as the subject; instead, use a proper noun or specific entity mentioned or implied in the abstract as the subject. "
        "Use clear and advanced language without generalizing or replacing specific methods with vague terms.\n\n"
        f"The summary should use clear, advanced language and mention the first author {formatted_author} followed by 'et al.':\n\n"
        f"Title: {title}\nAbstract: {abstract}\n\n"
        f"Summary by {formatted_author} et al.:"
    )
    
    # Concise summary: a short statement of the main finding, without making the study or its authors the subject
    concise_summary_prompt = (
        "In two sentence, provide a precise statement of the study’s main finding without generalizing and without making the study itself the subject. "
        "Do not use 'this study', 'the authors', or similar phrases as the subject; instead, use a proper noun or specific entity mentioned or implied in the abstract as the subject of the sentence. "
        "Directly present the finding as the sentence’s focus, using advanced and specific language.\n\n"
        f"Title: {title}\nAbstract: {abstract}\n\n"
    )

    response_decision = await chat_func(decision_prompt)
    if response_decision is None:
        return "Summary unavailable due to server error."
    decision_text = response_decision.choices[0].message.content.strip().lower()

    if "full" in decision_text:
        prompt_summary = full_summary_prompt
    else:
        prompt_summary = concise_summary_prompt
        
    response = await chat_func(prompt_summary)
    
    if response is None:
        return "Summary unavailable due to server error."

    try:
        result = response.choices[0].message.content.strip()
        # Normalize internal whitespace into single spaces
        summary = " ".join(result.split())
        return summary
    except AttributeError:
        logger.error(f"Error in chat_func response format: {response}")
        return "Summary unavailable"


# Function to reformat first author name
def reformat_author_name(author_name):
    """
    Reformat the first author name by removing commas.
    
    Args:
        author_name (str): Name of the first author.
        
    Returns:
        str: Reformatted name of the first author.
        
    """
    try:
        return author_name.replace(",", "")
    except AttributeError:
        return "Unknown Author"


# Function to generate 3-5 hierarchical subheadings related to the main topic
async def generate_subheadings(
    relevant_papers_df, main_topic, 
    uuid, customer_name, model_name,
    chat_func
):
    """
    Generate 3-5 hierarchical subheadings related to the main topic based on the summaries of relevant papers.
    
    Args:
        relevant_papers_df: DataFrame containing relevant papers.
        main_topic: Main topic of the research.
        uuid: Unique identifier for the task.
        customer_name: Name of the customer.
        model_name: Name of the chat model (used in the output path).
        chat_func: Function to send chat messages to the chatbot.
        
    Returns:
        List[str]: List of generated subheadings.
    
    """
    # Determine the number of subheadings based on the number of rows
    num_papers = len(relevant_papers_df)
    if num_papers < 10:
        num_subheadings = 1
    elif num_papers <= 20:
        num_subheadings = 2
    elif num_papers <= 40:
        num_subheadings = 3
    elif num_papers <= 60:
        num_subheadings = 4
    elif num_papers <= 100:
        num_subheadings = 5
    else:
        num_subheadings = 6
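
    # Worked examples of the mapping above: 8 papers -> 1 subheading,
    # 35 papers -> 3, anything above 100 -> 6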

    # Generate the summaries for the prompt
    summaries = " ".join(relevant_papers_df['Summary'].tolist())
    
    # Create the improved prompt
    prompt = (
        f"Consider the following main topic: '{main_topic}'. You are given a set of summaries extracted from relevant research papers related to this topic. Your goal is to generate {num_subheadings} hierarchical subheadings that clearly reflect and logically organize the key concepts and themes found in these summaries.\n\n"
        "Instructions:\n"
        "1. Carefully read and analyze the provided summaries.\n"
        "2. Identify broad thematic categories directly mentioned or strongly implied by the summaries. These should serve as the starting points for the subheadings.\n"
        "3. Arrange the subheadings in a hierarchical manner: start with the most general or foundational aspects of the main topic, then move progressively towards more specific, nuanced, or advanced themes.\n"
        "4. Ensure that each subheading is distinct and does not overlap in scope or content with the others. Every subheading should be directly supported by information present in the summaries.\n"
        "5. Do not introduce concepts that are not reflected in the summaries. All subheadings must be grounded in the text provided.\n"
        "6. The final output should be a simple list of subheadings, each preceded by a hyphen, without additional explanation or commentary.\n\n"
        f"Summaries:\n{summaries}\n\n"
        "Output format:\n- Subheading 1\n- Subheading 2\n- Subheading 3\n..."
    )
    
    response = await chat_func(prompt)
    subheadings = response.choices[0].message.content.strip().splitlines()
    # str.replace does not interpret regexes, so use re.sub to strip the list
    # markers and any other leading/trailing non-word characters
    subheadings = [re.sub(r"^[-*'\s]+", '', subheading).strip() for subheading in subheadings]
    subheadings = [re.sub(r"^[^\w]+|[^\w]+$", '', subheading).strip()
                   for subheading in subheadings]
    subheadings = [s for s in subheadings if s]
    subheadings = subheadings[:num_subheadings]
    logger.info("Generated Subheadings:\n" + "\n".join(subheadings))
    
    output_filename = f"{customer_name}/{uuid}/{model_name}/generated_subheadings.txt"
    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_filename,
        file_content="\n".join(subheadings)
    )
    logger.info(f"Subheadings saved to {output_filename}")
    return subheadings


# Function to assign summaries to subheadings with minimum allocation of references per subheading
async def assign_subheadings_to_summaries(
    relevant_papers_df, 
    subheadings, 
    uuid, customer_name, model_name,
    chat_func
):
    """
    Assign summaries to subheadings with minimum allocation of references per subheading.
    
    Args:
        relevant_papers_df: DataFrame containing relevant papers.
        subheadings: List of subheadings.
        uuid: Unique identifier for the task.
        customer_name: Name of the customer.
        model_name: Name of the chat model (used in the output path).
        chat_func: Function to send chat messages to the chatbot.

    Returns:
        DataFrame with assigned subheadings.
    
    """
    total_papers = len(relevant_papers_df)
    min_papers_per_subheading = math.ceil(total_papers / (len(subheadings) + 1))
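    # Worked example of this allocation rule: with 30 papers and 4 subheadings,
    # min_papers_per_subheading = ceil(30 / (4 + 1)) = 6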

    assigned_subheadings = []
    prompts = []
    for summary in relevant_papers_df['Summary']:
        prompt = (
            # Make the task explicit for the model
            f"Given the following subheadings and a research paper summary, identify the single most appropriate subheading for the provided summary. "
            f"You must carefully analyze the semantic content, thematic focus, and logical structure within the summary. "
            f"Ensure that the chosen subheading closely matches the core topic, key findings, research objectives, or main arguments of the paper summary. "
            f"Do not select a subheading that only partially fits; the chosen subheading should represent a strong and direct thematic alignment with the summary's central ideas. "
            f"Each subheading covers a distinct aspect or theme. Avoid overlaps by choosing the one that best captures the essence of the summary. "
            f"If a subheading does not logically or semantically align with the main theme or content described in the summary, it should not be chosen.\n\n"
        
            # Provide the list of subheadings
            f"Subheadings:\n{subheadings}\n\n"
        
            # Provide the paper summary
            f"Summary:\n{summary}\n\n"
        
            # Request the output format
            "Output format:\nSubheading: [Chosen subheading]"
        )
        prompts.append(prompt)
    responses = await asyncio.gather(
        *(chat_func(prompt) for prompt in prompts)
    )
    for response in responses:
        content = response.choices[0].message.content
        # Expected format: "Subheading: [choice]"; fall back to the raw text
        assigned_subheading = content.split(": ", 1)[1] if ": " in content else content.strip()
        assigned_subheadings.append(assigned_subheading)

    relevant_papers_df['Assigned Subheading'] = assigned_subheadings

    # Ensure minimum papers per subheading
    counts = relevant_papers_df['Assigned Subheading'].value_counts().to_dict()
    for subheading in subheadings:
        if counts.get(subheading, 0) < min_papers_per_subheading:
            extra_summaries = relevant_papers_df[relevant_papers_df['Assigned Subheading'] != subheading].sample(
                min_papers_per_subheading - counts.get(subheading, 0)
            )
            relevant_papers_df.loc[extra_summaries.index,
                                   'Assigned Subheading'] = subheading
            
    relevant_papers_df['Assigned Subheading'] = (
        relevant_papers_df['Assigned Subheading']
        .str.replace(r"^[^\w]+|[^\w]+$", '', regex=True)  # strip leading/trailing non-word characters
        .str.strip()  # strip surrounding whitespace
    )

    prefix = f"{customer_name}/{uuid}/{model_name}/"
    output_dir = prefix

    csv_filename = os.path.join(output_dir, "assigned_subheadings.csv")

    await upload_dataframe_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=csv_filename,
        df=relevant_papers_df,
    )

    logger.info(f"Assigned subheadings saved to {csv_filename}")
    logger.info(f"Found {len(relevant_papers_df)} related papers")

    return relevant_papers_df


async def get_sorting_suggestions(subheading, sub_df, chat_func):
    """
    Ask the chat model for a narrative ordering of the papers under one
    subheading and return the sub-DataFrame reordered accordingly.
    """
    # Add an original-index column to sub_df to retain the original paper number
    sub_df = sub_df.copy()  # Avoid SettingWithCopyWarning
    sub_df.reset_index(drop=True, inplace=True)
    sub_df.index = sub_df.index + 1
    sub_df['Original Index'] = sub_df.index

    paper_num = sub_df.shape[0]
    logger.info(f"Sorting {paper_num} papers under '{subheading}'")

    if paper_num > 1:
        # Combine summaries into one string, appending author information
        summaries_text = '\n'.join(
            [f"Paper {row['Original Index']} by {row['First Author']}:\nSummary: {row['Summary']}\nRelevance Keywords: {row['Relevance Keywords']}" 
            for _, row in sub_df.iterrows()]
        )
        logger.info(summaries_text)
        
        prompt = (
            f"You are an experienced scientist tasked with organizing a collection of {paper_num} papers under the subheading '{subheading}' for a scientific review article.\n\n"
        
            "You have the following input:\n"
            "1. A set of papers, each with a summary and relevance keywords.\n"
            "2. A need to arrange these papers in a coherent and logical order that supports a narrative flow in a review article.\n\n"
        
            "Please address the following tasks:\n\n"
            "1. **Identify Key Themes and Group Papers:**\n"
            "- First, thoroughly read the summaries and relevance keywords of all the provided papers.\n"
            "- Determine distinct thematic groups or categories. A thematic group can be based on shared methodology, a common theoretical framework, a particular type of material, organism, phenomenon, or a progressive line of inquiry.\n"
            "- The grouping should reflect logical subdivisions that a reader of a review article could follow. For instance:\n"
            "  - Start with foundational or broadly relevant studies that introduce key concepts, contexts, or basic methods.\n"
            "  - Follow with papers that build upon these foundations, introducing more advanced techniques, deeper investigations, specialized findings, or novel approaches.\n"
            "  - Conclude with cutting-edge, most specialized, or recently introduced concepts that push the boundaries of the field.\n"
            "- If certain papers align well as a stepping stone from one theme to another, position them accordingly to create a smooth thematic transition.\n\n"
        
            "2. **Determine the Logical Order Within Each Group:**\n"
            "- Within each thematic group, arrange the papers in an order that naturally builds understanding. Consider:\n"
            "  - Present foundational or earlier conceptual frameworks before more advanced or derivative studies.\n"
            "  - Highlight any chronological clues (if provided) or logical sequences, such as a method introduced in one paper being applied or expanded in a later paper.\n"
            "  - Move from general to specific, from simpler methodologies to more complex analyses, or from well-established concepts to more tentative or innovative ones.\n\n"
        
            "3. **Combine Groups into a Cohesive Narrative:**\n"
            "- After organizing papers within their groups, merge the groups into a single final list.\n"
            "- The final list should read like a storyline: start with a broad, conceptual or methodological foundation, then move through intermediate studies that expand and refine these ideas, and end with the most advanced, specialized, or novel findings.\n"
            "- Ensure that transitions between groups make sense, helping a reader follow a narrative where each section logically paves the way for the next.\n\n"
        
            "4. **Provide the Final Ordered List:**\n"
            "- Present the final ordered list as a numbered list from 1 to {paper_num}.\n"
            "- Each entry should include the original paper number and the first author's name in the following format:\n"
            "   <Final Position>. <Original Paper Number>. (<First Author's Last Name>)\n\n"
            "For example:\n"
            "1. 3. (Smith)\n"
            "2. 1. (Johnson)\n"
            "3. 5. (Williams)\n\n"
            "All papers must appear once, and each final position should be unique. Do not omit any papers.\n\n"
        
            "Below are the papers:\n\n"
            f"{summaries_text}\n\n"
        
            "Please reflect on the thematic connections and carefully arrange the papers according to the instructions above."
        )

        # Parse the suggested ordering from the model's reply
        sorting_order = []
        sorting_response = await chat_func(prompt)
        sorting_suggestion = sorting_response.choices[0].message.content.strip()
        logger.info(f"Sorting suggestion: {sorting_suggestion}")
        matches = re.findall(r'(\d+)\.\s*(\d+)\.\s*\((.*?)\)', sorting_suggestion)

        # Debugging: log raw matches to verify correctness
        logger.info(f"Matches found: {matches}")

        # Reply format is "<Final Position>. <Original Paper Number>. (<Author>)"
        for match in matches:
            final_pos = int(match[0])      # Recommended final position
            original_num = int(match[1])   # Original paper number
            author = match[2].strip()      # Author name
            sorting_order.append((original_num, final_pos, author))
    else:
        author = sub_df["First Author"].values[0]
        sorting_order = [(1, 1, author)]
    
    # Ensure each paper appears once and each final position is unique
    final_positions = [x[1] for x in sorting_order]
    if len(sorting_order) == paper_num and len(set(final_positions)) == paper_num:
        pass  # Sorting succeeded
    elif abs(len(sorting_order) - paper_num) <= 2:
        logger.info(f"Warning: Sorting order mismatch, difference of {abs(len(sorting_order) - paper_num)}. Assigning missing positions.")
        existing_originals = {x[0] for x in sorting_order}
        missing_positions = set(range(1, paper_num + 1)) - set(final_positions)

        for original_num in range(1, paper_num + 1):
            if original_num not in existing_originals:
                random_position = random.choice(list(missing_positions))
                sorting_order.append((original_num, random_position, "Unknown Author"))  # Placeholder author
                missing_positions.remove(random_position)

    # Sort by recommended final position
    sorting_order.sort(key=lambda x: x[1])

    # Extract sorted original indices
    final_sorted_order = [item[0] for item in sorting_order]

    logger.info(f"Final sorted order: {final_sorted_order}")

    # Reorder sub_df based on the sorted order
    try:
        sorted_indices = [sub_df[sub_df['Original Index'] == idx].index[0] for idx in final_sorted_order]
        sorted_sub_df = sub_df.loc[sorted_indices].reset_index(drop=True)
    except Exception as e:
        logger.error(f"Error in sorting DataFrame: {e}")
        raise ValueError("Reordering of DataFrame failed.")
    
    return sorted_sub_df
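
# A minimal sketch of how the sorting reply is parsed above (the reply text
# here is made up for illustration). Each regex match is a tuple of
# (final position, original paper number, author last name):
#
#     >>> re.findall(r'(\d+)\.\s*(\d+)\.\s*\((.*?)\)', "1. 3. (Smith)\n2. 1. (Johnson)")
#     [('1', '3', 'Smith'), ('2', '1', 'Johnson')]
#
# Sorting the collected (original, position, author) tuples by position and
# reading off the original paper numbers gives the final order [3, 1].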


# Function to create expanded paragraphs with required reference count and consistent reference indexing
async def create_paragraphs_by_subheading(
    relevant_papers_df, subheadings, main_topic, 
    uuid, customer_name, model_name,
    chat_func
):
    """
    Create expanded paragraphs by subheading with required reference count and consistent reference indexing.
    
    Args:
        relevant_papers_df (pd.DataFrame): DataFrame containing relevant papers and their summaries.
        subheadings (list): List of subheadings for the review paper.
        main_topic (str): Main topic of the review paper.
        uuid (str): UUID of the task.
        customer_name (str): Name of the customer.
        model_name (str): Name of the chat model (used in output paths).
        chat_func (function): Function to send chat messages to the chatbot.

    Returns:
        str: The full review text, with subheadings and consistent reference indexing.
    
    """
    paragraphs = []

    # Reorder relevant_papers_df based on the subheadings order
    subheading_order = {subheading: idx for idx, subheading in enumerate(subheadings)}
    relevant_papers_df['Subheading Order'] = \
        relevant_papers_df['Assigned Subheading'].map(subheading_order)
    
    # Remove rows where 'Subheading Order' is NA
    relevant_papers_df = relevant_papers_df.dropna(subset=['Subheading Order'])
    
    relevant_papers_df = relevant_papers_df.sort_values(by='Subheading Order')
    
    relevant_papers_df.reset_index(drop=True, inplace=True)
    await upload_dataframe_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=f"{customer_name}/{uuid}/{model_name}/relevant_papers_sort.csv",
        df=relevant_papers_df,
    )
    
    # Split relevant_papers_df by 'Assigned Subheading' into separate sub-dataframes
    subheading_groups = relevant_papers_df.groupby('Assigned Subheading')
    
    sub_dfs = []
    for subheading in subheadings:
        # Check if subheading exists in subheading_groups
        if subheading in subheading_groups.groups:
            sub_df = subheading_groups.get_group(subheading)
            sub_dfs.append((subheading, sub_df))

    # Pair each sub-DataFrame with its own subheading; reusing the loop
    # variable after the loop would label every prompt with the last subheading
    sorted_sub_dataframes = await asyncio.gather(
        *(get_sorting_suggestions(subheading, sub_df, chat_func)
          for subheading, sub_df in sub_dfs)
    )
    
    sorted_sub_dataframes = [x for x in sorted_sub_dataframes if not x.empty]
    
    # Concatenate all sorted sub-dataframes and reset index
    if sorted_sub_dataframes:
        final_relevant_papers_df = pd.concat(sorted_sub_dataframes).reset_index(drop=True)
        final_relevant_papers_df.index = final_relevant_papers_df.index + 1  # Start from index 1
        final_relevant_papers_df['ref_index'] = final_relevant_papers_df.index  # Add ref_index column
    else:
        logger.error("Error: No valid sub-dataframes to concatenate.")
        final_relevant_papers_df = pd.DataFrame()  # Create an empty DataFrame in case of error

    final_relevant_papers_df = final_relevant_papers_df.drop_duplicates()
    logger.info(final_relevant_papers_df.head())
    
    # Introduction
    intro_prompt = (
        f"Write a concise and advanced introductory paragraph for a scientific review paper on '{main_topic}'. "
        "Introduce the topic, its importance, and the scope of the review. The introduction should provide a logical "
        "setup for the following subheadings.\n\n"
        "Output format:\n[Write introduction here]"
    )
    intro_response = await chat_func(intro_prompt)
    intro_paragraph = intro_response.choices[0].message.content.strip()
    paragraphs.append(f"**Introduction**\n{intro_paragraph}\n")

    used_titles = set()
    summaries_text_by_subheading = {subheading: [] for subheading in subheadings}
    ref_index_map = {}
    
    for subheading in subheadings:
        relevant_summaries = final_relevant_papers_df[
            final_relevant_papers_df['Assigned Subheading'] == subheading
        ]

        for idx, (summary, title, author, pub_date, ref_index) in relevant_summaries[
            ['Summary', 'Title', 'First Author', 'Publication Date', 'ref_index']
        ].iterrows():
            if title in used_titles:
                continue
            used_titles.add(title)
            ref_index_map[title] = ref_index
            summaries_text_by_subheading[subheading].append(
                f"{summary} [Ref: {ref_index}]"
            )
    
    logger.info(summaries_text_by_subheading)
    paragraph_prompts = []
    for subheading in subheadings:
        summaries_text = summaries_text_by_subheading[subheading]

        # Scale the word budget with the number of summaries
        num_summaries = len(summaries_text)
        if num_summaries < 10:
            word_size = num_summaries * 200 + 200  # Fewer than 10 summaries
        elif num_summaries > 30:
            word_size = num_summaries * 400 + 800  # More than 30 summaries
        elif num_summaries > 20:
            word_size = num_summaries * 350 + 500  # 21-30 summaries
        else:
            word_size = num_summaries * 250 + 300  # 10-20 summaries
        
        # Generate the detailed paragraph for the subheading
        paragraph_prompt = (
            f"Write a {word_size}-word thematically focused and critical paragraph for a scientific review on '{subheading}'. "
            "Please do the following:\n"
            "1. Begin the paragraph with roughly 100 words that summarize the main findings and objectives of the following studies, providing clear context for the discussion. You may supplement this introduction with additional relevant knowledge to enhance understanding.\n"
            "2. Before introducing each piece of literature, add a sentence or conjunction that connects it to the surrounding context.\n"
            "3. For each study, provide an overview, analyzing its objectives, methodologies, findings, and broader significance. "
            "Ensure that the analysis of each study is presented in sequence, without skipping any, and maintain a logical flow.\n"
            "4. Critically discuss the relevant literature, highlighting how it contributes to the field and emphasizing its strengths and limitations.\n"
            "5. After discussing all studies, provide a concluding passage that offers a deep analysis of the collective progress represented by the studies, "
            "identifying overarching trends, advancements, and gaps. Conclude with insightful suggestions for future directions and research areas that need further exploration.\n"
            "Please meet the following requirements:\n"
            "1. Maintain clear academic language in the style of *Nature*, with a focus on the relationships between studies and their contributions to the subheading's topic.\n"
            "2. Ensure in-text citations are included in the format [Ref: number], avoid repetition, and provide a critical, objective comparison where relevant.\n"
            "3. The entire paragraph should be coherent, without empty lines between studies, and flow logically from one point to the next. Each study must be fully represented, with no omission or skipping.\n"
            "4. To prevent the simple stacking of literature, consider how to make the article more readable, logical, and professional.\n\n"
            f"Summaries:\n{' '.join(s.strip() for s in summaries_text)}\n\n"
            "Output format:\n[Write paragraph here]"
        )
        paragraph_prompts.append(paragraph_prompt)

    paragraph_responses = await asyncio.gather(
        *(chat_func(para_prompt)
          for para_prompt in paragraph_prompts)
    )
    for subheading, paragraph_response in \
            zip(subheadings, paragraph_responses):
        paragraph_text = paragraph_response.choices[0].message.content.strip()
        paragraph_text = re.sub(r'\(Ref:\s*(\d+)\)', r'[Ref: \1]', paragraph_text)
        paragraph_text = re.sub(r'\n\s*\n', '\n', paragraph_text)
        paragraph_text = paragraph_text.replace('\n', ' ')
        paragraph = f"**{subheading}**\n{paragraph_text}\n"
        paragraphs.append(paragraph)

    # Conclusion
    conclusion_prompt = (
        f"Write a concluding paragraph for a scientific review on '{main_topic}'. Summarize the main points discussed in the previous sections, "
        "highlight the significance of the research, and suggest possible future directions or applications.\n\n"
        "Output format:\n[Write conclusion here]"
    )
    conclusion_response = await chat_func(conclusion_prompt)
    conclusion_paragraph = conclusion_response.choices[0].message.content.strip()
    paragraphs.append(f"**Conclusion**\n{conclusion_paragraph}\n")

    used_references = final_relevant_papers_df[
        ['Title', 'First Author', 'Journal Title', 'Publication Date', 'ref_index']
    ].sort_values(by='ref_index')

    # References section (only used references)
    references = "\n".join([
        f"[Ref: {idx}] {author} et al. {title}. {journal_title} ({pub_date})."
        for idx, (author, title, journal_title, pub_date, ref_index)
            in enumerate(used_references[
                ['First Author', 'Title', 'Journal Title', 'Publication Date', 'ref_index']
            ].values, 1
        )
    ])
    paragraphs.append(f"**References**\n{references}")

    # Compile paragraphs into final content
    final_content = "\n".join(paragraphs)

    # Save the assembled review text to MinIO under the customer/task prefix
    prefix = f"{customer_name}/{uuid}/{model_name}/"
    output_dir = prefix

    review_file = os.path.join(output_dir, "review_non_refined.txt")

    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=review_file,
        file_content=final_content
    )
    
    logger.info(f"Non-refined review saved to {review_file}")
    return final_content
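
# Citation normalization sketch: the model sometimes emits "(Ref: 12)"; the
# re.sub in the paragraph loop above rewrites it to the canonical form:
#
#     >>> re.sub(r'\(Ref:\s*(\d+)\)', r'[Ref: \1]', "as shown (Ref: 12).")
#     'as shown [Ref: 12].'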


# Function to enhance language and readability to meet Nature journal style
async def enhance_language_readability(content, chat_func):
    """
    Enhance the language and readability of the given content to meet the style of the *Nature* journal.
    
    Args:
        content (str): The content to enhance.
        chat_func (function): The function to use for the chat completion.
        
    Returns:
        str: The enhanced content.
    
    """
    # Separate sections based on paragraph breaks
    sections = content.split("\n\n")
    enhanced_sections = []
    prompts = []
    for section in sections:
        prompt = (
            "Enhance the following text to align with the writing style of *Nature* journal. Refine language to be sophisticated and objective, "
            "using advanced vocabulary and a factual tone. Ensure a high level of lexical diversity and rhythm, with alternating sentence lengths "
            "and varied structures for readability. Avoid emotional, speculative, or conversational language, focusing on objective analysis.\n\n"
            f"Text:\n{section}\n\n"
            "Output format:\n[Enhanced text here]"
        )
        prompts.append(prompt)

    responses = await asyncio.gather(
        *(chat_func(prompt) for prompt in prompts)
    )
    for response in responses:
        enhanced_section = response.choices[0].message.content.strip()
        enhanced_sections.append(enhanced_section)

    return "\n\n".join(enhanced_sections)


async def split_by_section(content):
    """
    Split the given content into (subheading, text) sections based on the
    bold '**Subheading**' markers.

    Args:
        content (str): The content to split.

    Returns:
        list: List of (subheading, paragraph_text) tuples.

    """
    # Match lines that consist solely of a bold '**...**' subheading
    subheading_pattern = r"(?m)^\*\*(.*?)\*\*$"
    matches = list(re.finditer(subheading_pattern, content))
    
    sections = []
    references_found = False
    for i, match in enumerate(matches):
        subheading = match.group(1).strip()  # Get the subheading text
        if subheading.lower() == "references":
            references_found = True
        
        start = match.end()  # End of the subheading line
        end = matches[i + 1].start() if i + 1 < len(matches) else len(content)
        paragraph_text = content[start:end].strip()
        
        if references_found:  # Add everything under "References" as is
            sections.append((subheading, paragraph_text))
            break  # Stop further processing
        
        sections.append((subheading, paragraph_text))
    
    return sections
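
# Illustrative input/output for split_by_section (the content shown is made
# up; real input comes from create_paragraphs_by_subheading):
#
#     content = "**Introduction**\nIntro text.\n**References**\n[Ref: 1] ..."
#     sections = await split_by_section(content)
#     # -> [("Introduction", "Intro text."), ("References", "[Ref: 1] ...")]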


async def process_sections(sections, chat_func):
    """
    Processes each section (subheading and corresponding text) through the AI model.
    Skips processing the "Introduction", "Conclusion", and "References" sections.
    """
    refined_sections = []
    seen_subheadings = set()
    skip_subheadings = {"introduction", "conclusion", "references"}  # Sections to skip

    prompts = []
    for idx, (subheading, text) in enumerate(sections):
        subheading_clean = subheading.strip("*").strip()
        logger.info(f"Processing section {idx + 1} of {len(sections)}: {subheading_clean}")
        
        if subheading_clean.lower() in skip_subheadings:
            logger.info(f"Skipping '{subheading_clean}' section.")
            # refined_sections.append((subheading, text))  # Keep these sections as is
            continue
        
        if subheading_clean in seen_subheadings:
            logger.info(f"Duplicate subheading detected: {subheading_clean}. Skipping.")
            continue
        
        seen_subheadings.add(subheading_clean)
        if text.strip():  # Skip empty sections
            # Remove extra newlines and ensure no empty lines in the text
            text = re.sub(r'\n\s*\n', ' ', text)  # Replace multiple newlines with a single space
            text = text.replace('\n', ' ')        # Replace remaining newlines with spaces
            text = re.sub(r'\s+', ' ', text).strip()  # Ensure no extra spaces
            
            # Updated prompt for higher review quality
            prompt = textwrap.dedent(f"""
            Your task is to refine the following academic section for clarity, depth, and suitability for publication in a high-impact journal.

            Please adhere to these guidelines:

            **1. Structure and Organization:**
            - Identify and emphasize key themes or topics within the section.
            - Group related studies together to enhance coherence and logical flow.
            - Reorganize the content to ensure a clear progression of ideas.
            - Use smooth transitions to connect paragraphs and concepts without relying on explicit subheadings.

            **2. Integration and Analysis of Literature:**
            - Synthesize findings from cited studies, highlighting connections, similarities, and differences.
            - Avoid merely listing studies; focus on comparative analysis and critical evaluation.
            - Highlight significant contributions, novel findings, or implications of each study.
            - Discuss any controversies, differing perspectives, or gaps in the current research.

            **3. Depth and Critical Insight:**
            - Deepen analytical insights by going beyond surface-level summarization.
            - Provide critical evaluations, discussing strengths, limitations, and areas needing further exploration.
            - Highlight the significance of trends or shifts in the field.

            **4. Language and Clarity:**
            - Use precise and concise language appropriate for an academic audience.
            - Vary sentence structures to enhance readability and engagement.
            - Eliminate redundant or repetitive statements to streamline the content.
            - Maintain a formal academic tone while ensuring the text is accessible.

            **5. Consistency and Terminology:**
            - Ensure consistency in terminology, style, and formatting throughout the section.
            - Use technical terms accurately and define specialized terms if necessary.
            - Avoid unnecessary acronyms unless commonly understood in the field.

            **6. Accuracy and Detail:**
            - Verify that descriptions of studies are accurate and that key findings are correctly represented.
            - Emphasize the most relevant and impactful information from each study.
            - Provide context where needed to aid understanding for a multidisciplinary audience.

            **7. Conclusion and Future Directions:**
            - Summarize main points and discuss how findings align or diverge from prior work.
            - Suggest areas for future research based on identified gaps or limitations.
            - Discuss practical implications or potential applications if relevant.

            **8. Citation and Formatting:**
            - Ensure citations are formatted accurately (e.g., [Ref: number]) and integrated smoothly into the text.
            - Do not alter the "References" section or the citation order.
            - Maintain the existing citation positions within the text.

            **Section to refine:**
            {text}
            """)

            prompts.append(prompt)
    
    # Call the AI model with the updated prompt
    index = 0
    refined_texts = await asyncio.gather(
        *(chat_func(prompt) for prompt in prompts)
    )
    
    logger.info(len(refined_texts))
    logger.info(len(prompts))
    
    seen_subheadings = set()
    for idx, (subheading, text) in enumerate(sections):
        subheading_clean = subheading.strip("*").strip()
        logger.info(f"Processing section {idx + 1} of {len(sections)}: {subheading_clean}")
        
        if subheading_clean.lower() in skip_subheadings:
            refined_sections.append((subheading, text))
            continue
            
        if subheading_clean in seen_subheadings:
            logger.info(f"Duplicate subheading detected: {subheading_clean}. Skipping.")
            continue
        
        seen_subheadings.add(subheading_clean)
        if text.strip():
            refined_text = refined_texts[index].choices[0].message.content.strip()
            refined_text = re.sub(r'\n\s*\n', ' ', refined_text)  # Replace extra newlines with a single space
            refined_text = refined_text.replace('\n', ' ')        # Replace remaining newlines with spaces
            refined_text = re.sub(r'\s+', ' ', refined_text).strip()  # Ensure no extra spaces
            refined_sections.append((subheading, refined_text))
            index += 1
    
    return refined_sections


async def process_papers(
    dataframe, topic, direction, 
    uuid, customer_name, model_name,
    chat_func
):
    """
    Process the given papers to extract relevant information and save it to a CSV file.
    
    Args:
        dataframe (pandas.DataFrame): The DataFrame containing the papers.
        topic (str): The topic to filter the papers by.
        direction (str): The direction to filter the papers by.
        uuid (str): The UUID of the task.
        customer_name (str): The name of the customer.
        chat_func (function): The function to use for the chat completion.
        
    Returns:
        pandas.DataFrame: The DataFrame containing the relevant papers.
    
    """
    prefix = f"{customer_name}/{uuid}/{model_name}/"
    output_dir = prefix

    output_path = os.path.join(output_dir, "relevant_papers.csv")

    # Build the CSV in memory, starting with the header row
    texts = ""
    fieldnames = ["Journal Title", "Publication Date", "Title",
                  "First Author", "Summary", "Is Relevant", "Relevance Keywords"]
    texts += ",".join([escape_csv_field(x) for x in fieldnames]) + "\n"

    titles = []
    abstracts = []
    journal_titles = []
    pubd_dates = []
    first_authors = []
    for idx, row in dataframe.iterrows():
        title = row["TI"]
        abstract = row["AB"]
        journal_title = row["JT"]
        pub_date = row["DCOM"]
        first_author = row["FAU-frist"]

        titles.append(title)
        abstracts.append(abstract)
        journal_titles.append(journal_title)
        pubd_dates.append(pub_date)
        first_authors.append(first_author)

    relevants = await asyncio.gather(
        *(is_relevant(
            title, abstract, topic, direction, chat_func
        ) for title, abstract in zip(titles, abstracts))
    )

    is_relevant_flags = [relevant[0] for relevant in relevants]
    relevance_keywords = [relevant[1] for relevant in relevants]

    rtitles = []
    rabstracts = []
    rjournal_titles = [] 
    rpubd_dates = []
    rfirst_authors = []
    rflags = []
    rkeywords = []

    for (
        rflag, rkeyword, title, abstract, first_author, journal_title, pub_date
    ) in zip(
        is_relevant_flags, relevance_keywords,
        titles, abstracts, first_authors, journal_titles, pubd_dates
    ):
        if rflag:
            rtitles.append(title)
            rabstracts.append(abstract)
            rfirst_authors.append(first_author)
            rjournal_titles.append(journal_title)
            rpubd_dates.append(pub_date)
            rflags.append(rflag)
            rkeywords.append(rkeyword)

    summaries = await asyncio.gather(
        *(summarize_abstract(
            title, abstract, first_author, chat_func
        ) for title, abstract, first_author in
            zip(rtitles, rabstracts, rfirst_authors)
        )
    )

    for (
        summary,
        journal_title, pub_date, title, first_author,
        rflag, rkeyword
    ) in zip(
        summaries,
        rjournal_titles, rpubd_dates, rtitles, rfirst_authors,
        rflags, rkeywords
    ):
        journal_title = escape_csv_field(journal_title)
        pub_date = escape_csv_field(pub_date)
        title = escape_csv_field(title)
        first_author = escape_csv_field(first_author)
        summary = escape_csv_field(summary)
        rkeyword = escape_csv_field(rkeyword)

        texts += ",".join([
            str(x) for x in [
                journal_title, pub_date, title, first_author,
                summary, rflag, rkeyword
            ]
        ]) + "\n"

        # Print the added summary and keywords
        logger.info(f"Added summary: {summary}")
        logger.info(f"Relevance Keywords: {rkeyword}")

    # Upload the assembled CSV and return its object path
    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_path,
        file_content=texts
    )

    return output_path


async def translate_to_chinese_before_references(
    text, 
    uuid, customer_name, model_name, 
    chat_func
):
    """
    Translates the content of a text file to Chinese, keeping the '**References**' section in English.
    
    Args:
        text (str): The content of the text file.
        output_filename (str): The name of the output file.
        chat_func (function): The function to use for translation.
        
    Returns:
        str: The translated content.
    
    """
    lines = text.split("\n")

    # Step 1: Find the index of the '**References**' line
    references_index = None
    for i, line in enumerate(lines):
        if line.strip() == "**References**":
            references_index = i
            break

    # Step 2: Split the content at that index
    if references_index is not None:
        main_content_lines = lines[:references_index]
        references_content_lines = lines[references_index:]
    else:
        # If '**References**' is not found, treat the whole content as body text
        main_content_lines = lines
        references_content_lines = []

    # Join the body lines back into a single string
    main_content = "\n".join(main_content_lines)

    # Step 3: Translate the body section by section
    sections = main_content.split("\n\n")
    translated_sections = []

    prompts = []
    
    for section in sections:
        # Keep the prompt simple: only request a translation of the body text
        prompt = (
            "Translate the following text to academic Chinese:\n\n"
            f"Text:\n{section}\n\n"
            "Output format:\n[Translated Chinese text here]"
        )
        prompts.append(prompt)
    
    responses = await asyncio.gather(
        *(chat_func(prompt) for prompt in prompts)
    )
    for response in responses:
        translated_section = response.choices[0].message.content.strip()
        translated_sections.append(translated_section)

    # Step 4: Reassemble the translated body
    translated_content = "\n\n".join(translated_sections)

    # Step 5: Append the untranslated References section
    if references_content_lines:
        references_content = "\n".join(references_content_lines)
        final_content = translated_content + "\n\n" + references_content
    else:
        final_content = translated_content

    # Step 6: Save the result to MinIO
    output_filename = f"{customer_name}/{uuid}/{model_name}/review_non_refined_translated.txt"
    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_filename,
        file_content=final_content
    )

    logger.info(f"\nTranslated content saved to {output_filename}")


async def translate_refined_review_to_chinese(
    refined_review_content,
    uuid, customer_name, model_name,
    chat_func
):
    """
    Translate a refined review (a Word document) to Chinese, leaving the
    References section in English, and upload the translated .docx to MinIO.

    Returns:
        str: The MinIO object path of the translated document.
    """
    # Read the Word document
    doc = Document(refined_review_content)
    
    # Prepare to create a new document for the translated content
    translated_doc = Document()
    
    # Set of subheadings to skip translation
    skip_subheadings = {"references"}
    
    # Keep track of the current section heading
    current_heading = None
    in_references_section = False
    
    prompts = []
    for para in doc.paragraphs:
        # Check if the paragraph is a heading
        if para.style.name.startswith('Heading'):
            # Get the heading text
            current_heading = para.text.strip()
            # Get the heading level
            heading_level_match = re.findall(r'\d+', para.style.name)
            heading_level = int(heading_level_match[0]) if heading_level_match else 1
            
            # First pass: only collect translation prompts; the document is
            # rebuilt in the second pass once all translations are available
            if current_heading.lower() in skip_subheadings:
                in_references_section = True
            else:
                in_references_section = False
                # Translate the heading
                prompt = f"Translate the following heading to Chinese:\n\n{current_heading}"
                prompts.append(prompt)
        else:
            if in_references_section:
                # References paragraphs stay in English; nothing to collect
                pass
            else:
                # Translate the paragraph text to Chinese, preserving in-text citations
                text_to_translate = para.text
                if text_to_translate.strip() == '':
                    # If the paragraph is empty, skip translation
                    translated_doc.add_paragraph('')
                else:
                    # We need to preserve in-text citations, e.g., [Ref: 38]
                    # Instruct the AI to keep the in-text citations in English
                    prompt = f"""
                    Translate the following text to academic Chinese. Keep any in-text citations (e.g., [Ref: number]) in English.

                    Text:
                    {text_to_translate}
                    """
                    prompts.append(prompt)
    
    translated_texts = await asyncio.gather(
        *(chat_func(prompt) for prompt in prompts)
    )
    translated_texts = [
        t.choices[0].message.content.strip() for t in translated_texts
    ]
    
    # Second pass: rebuild the document, consuming translations in order
    index = 0
    in_references_section = False
    for para in doc.paragraphs:
        # Check if the paragraph is a heading
        if para.style.name.startswith('Heading'):
            # Get the heading text
            current_heading = para.text.strip()
            # Get the heading level
            heading_level_match = re.findall(r'\d+', para.style.name)
            heading_level = int(heading_level_match[0]) if heading_level_match else 1
            
            # Check if the heading text is in skip_subheadings
            if current_heading.lower() in skip_subheadings:
                in_references_section = True
                # Add the heading as is
                translated_doc.add_heading(current_heading, level=heading_level)
            else:
                in_references_section = False
                translated_doc.add_heading(translated_texts[index], level=heading_level)
                index += 1
        else:
            if in_references_section:
                # Add the paragraph as is
                translated_doc.add_paragraph(para.text)
            else:
                # Translate the paragraph text to Chinese, preserving in-text citations
                text_to_translate = para.text
                if text_to_translate.strip() == '':
                    # If the paragraph is empty, skip translation
                    translated_doc.add_paragraph('')
                else:
                    translated_text = translated_texts[index]
                    translated_doc.add_paragraph(translated_text)
                    index += 1
    
    output_file_path = f"{customer_name}/{uuid}/{model_name}/review_paper_refined_translated.docx"
    await upload_document_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_file_path,
        document=translated_doc
    )
    return output_file_path


async def refine_review_content(
    non_refine_content,
    uuid, customer_name, model_name,
    chat_func
):
    """
    Refine the draft review section by section, generate a title, and save
    the result as a Word document in MinIO.

    Returns:
        str: The MinIO object path of the refined document.
    """
    sections = await split_by_section(non_refine_content)
    refined_sections = await process_sections(sections, chat_func)
    
    prompt_title = f"""
    Based on the following literature review, generate an appropriate and concise title:
    {non_refine_content}
    """
    title = await chat_func(prompt_title)
    title = title.choices[0].message.content.strip()
    logger.info(f"Generated Title: {title}")
    
    doc = Document()
    doc.add_heading(title, level=1)
    
    for subheading, content in refined_sections:
        doc.add_heading(subheading, level=2)
        doc.add_paragraph(content)
    
    output_file = f"{customer_name}/{uuid}/{model_name}/review_paper_refined.docx"
    await upload_document_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_file,
        document=doc
    )
    return output_file


# Main function to automate the review paper creation process with language enhancement step
async def create_review_paper(
    relevant_papers_df,
    main_topic,
    uuid, customer_name, model_name,
    chat_func,
    translate_to_cn=False,
    do_refine=False,
):
    """
    Main function to automate the review paper creation process with language enhancement step.
    
    Args:
        relevant_papers_df (pd.DataFrame): DataFrame containing relevant papers.
        main_topic (str): Main topic of the review paper.
        uuid (str): Unique identifier for the review paper.
        customer_name (str): Name of the customer.
        chat_func (function): Function to handle chat interactions.
        translate_to_cn (bool): Flag to indicate if translation to Chinese is required.
    
    Returns:
        None
    
    """
    
    # Step 1: Generate subheadings related to the main topic
    subheadings = await generate_subheadings(
        relevant_papers_df, main_topic,
        uuid, customer_name, model_name,
        chat_func
    )

    # Step 2: Assign each summary to a subheading
    relevant_papers_df = await assign_subheadings_to_summaries(
        relevant_papers_df, subheadings, 
        uuid, customer_name, model_name,
        chat_func
    )

    # Step 3: Create paragraphs by subheading, with introductory and concluding sections, and references
    review_content = await create_paragraphs_by_subheading(
        relevant_papers_df, subheadings, main_topic, 
        uuid, customer_name, model_name,
        chat_func
    )

    output_filename = f"{customer_name}/{uuid}/{model_name}/review_non_refined.txt"
    
    if do_refine:
        # Step 4: Refine Review Content to a Word Document
        await refine_review_content(
            review_content,
            uuid, customer_name, model_name,
            chat_func
        )
        refined_review_content = await get_file_from_minio(
            bucket_name=BUCKET_NAME,
            object_name=f"{customer_name}/{uuid}/{model_name}/review_paper_refined.docx",
        )
        refined_review_content = io.BytesIO(refined_review_content.data)
        
    if translate_to_cn:
        if do_refine:
            await translate_refined_review_to_chinese(
                refined_review_content,
                uuid, customer_name, model_name,
                chat_func
            )
            output_filename = f"{customer_name}/{uuid}/{model_name}/review_paper_refined_translated.txt"
        else:
            await translate_to_chinese_before_references(
                review_content,
                uuid, customer_name, model_name,
                chat_func
            )
        output_filename = f"{customer_name}/{uuid}/{model_name}/review_non_refined_translated.txt"
    return output_filename
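
# End-to-end usage sketch (a minimal illustration assuming `pubmed_df`
# carries the TI/AB/JT/DCOM/FAU-frist columns read by process_papers; the
# relevant-papers CSV is loaded back from MinIO before building the review):
#
#     csv_path = await process_papers(pubmed_df, topic, direction,
#                                     uuid, customer_name, model_name, chat_func)
#     raw = await get_file_from_minio(bucket_name=BUCKET_NAME, object_name=csv_path)
#     relevant_df = pd.read_csv(io.BytesIO(raw.data))
#     final_path = await create_review_paper(
#         relevant_df, main_topic, uuid, customer_name, model_name,
#         chat_func, translate_to_cn=True, do_refine=True,
#     )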