File size: 25,255 Bytes
163107e
 
 
 
 
 
1d00bb1
163107e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
import os
import math
import asyncio

from loguru import logger

from .r2_utils import (
    upload_text_to_minio,
    upload_dataframe_to_minio,
)
from .common_utils import escape_csv_field


# Bucket name passed as ``bucket_name`` to every upload helper call in this module.
BUCKET_NAME = "ai-scientist"


# Function to check relevance and obtain keywords as reason
async def is_relevant(title, abstract, topic, direction, chat_func):
    """
    Check if a paper is relevant to a topic and obtain keywords as reason.

    The chat model is asked to answer in the form::

        Relevance: True or False
        Keywords: [Comma-separated keywords]

    The verdict is read from the ``Relevance:`` line only, so that a keyword
    such as "True positive rate" cannot turn a negative verdict into a
    positive one. If the reply contains no ``Relevance:`` line, the function
    falls back to looking for the literal ``True`` anywhere in the reply.

    Args:
        title (str): Title of the paper.
        abstract (str): Abstract of the paper.
        topic (str): Topic to check relevance against.
        direction (str): Direction to check relevance against.
        chat_func (function): Awaitable callable taking the prompt and
            returning a chat completion response, or None on failure.

    Returns:
        tuple[bool, str]: ``(relevance, keywords)``. ``keywords`` is the text
        following ``Keywords:`` in the reply (empty string if absent). When
        the request failed or the response is malformed, the flag is False
        and the string is a short explanation.

    """
    relevance_prompt = (
        f"You are an academic expert in {topic}. Identify if the following paper is "
        f"related to '{direction}' and list only the main keywords that indicate relevance:\n\n"
        f"Title: {title}\nAbstract: {abstract}\n\n"
        "Answer format:\n"
        "Relevance: True or False\n"
        "Keywords: [Comma-separated keywords]"
    )
    response = await chat_func(relevance_prompt)
    if response is None:
        return False, "Relevance check unavailable due to server error."

    try:
        response_text = response.choices[0].message.content

        # Look for the explicit "Relevance:" answer line first; markdown
        # decoration such as "**Relevance:** True" is tolerated.
        relevance = None
        for line in response_text.splitlines():
            label, sep, value = line.partition(":")
            if sep and label.strip(" *-").lower() == "relevance":
                relevance = value.strip(" *").lower().startswith("true")
                break
        if relevance is None:
            # No structured answer line: keep the permissive legacy check.
            relevance = "True" in response_text

        keywords = response_text.split(
            "Keywords:")[-1].strip() if "Keywords:" in response_text else ""
        return relevance, keywords
    except (AttributeError, IndexError, TypeError):
        # loguru formats with "{}" placeholders; without one the response
        # object would be silently dropped from the log line.
        logger.error("Error in chat_func response format: {}", response)
        return False, "Relevance check failed"


# Modified summarize_abstract function with error handling for failed completion requests
async def summarize_abstract(title, abstract, first_author, chat_func):
    """
    Summarize the abstract of a research paper.

    Args:
        title (str): Title of the paper.
        abstract (str): Abstract of the paper.
        first_author (str): Name of the first author; it is passed through
            ``reformat_author_name`` before being placed in the prompt.
        chat_func (function): Awaitable callable taking the prompt and
            returning a chat completion response, or None on failure.

    Returns:
        str: Summary of the abstract with all runs of whitespace collapsed
        to single spaces, or a short placeholder message when the request
        failed or the response is malformed.

    """
    formatted_author = reformat_author_name(first_author)
    summary_prompt = (
        f"Write a concise, high-level summary in 2-3 sentences, highlighting the study's "
        f"purpose, specific methodology, main findings, and significance. Avoid generalizing "
        f"or replacing specific method names or entities with vague language. Retain concrete terms "
        f"and clear descriptions of methodology and findings.\n\n"
        f"Title: {title}\nAbstract: {abstract}\n\n"
        f"Summary by {formatted_author} et al.:"
    )

    response = await chat_func(summary_prompt)
    if response is None:
        return "Summary unavailable due to server error."

    try:
        result = response.choices[0].message.content
        # split() + join collapses newlines and repeated spaces so the
        # summary stays on a single line.
        return " ".join(result.split())
    except (AttributeError, IndexError, TypeError):
        # loguru formats with "{}" placeholders; without one the response
        # object would be silently dropped from the log line.
        logger.error("Error in chat_func response format: {}", response)
        return "Summary unavailable"


# Function to reformat first author name
def reformat_author_name(author_name):
    """
    Strip every comma from the first author's name.

    Args:
        author_name (str): Name of the first author.

    Returns:
        str: The name without commas, or "Unknown Author" when the value
        has no ``replace`` method (for example None or a float).

    """
    try:
        cleaned = author_name.replace(",", "")
    except AttributeError:
        # Non-string input: fall back to a placeholder name.
        cleaned = "Unknown Author"
    return cleaned


# Function to generate 3-5 hierarchical subheadings related to the main topic
async def generate_subheadings(
    relevant_papers_df, main_topic,
    uuid, customer_name, model_name,
    chat_func
):
    """
    Generate 3-5 hierarchical subheadings related to the main topic based on the summaries of relevant papers.

    The model is asked to answer as a "- " bulleted list. The bullet markers
    and any blank lines are removed, so the returned items are the bare
    heading texts.

    Args:
        relevant_papers_df: DataFrame containing relevant papers; its
            'Summary' column is concatenated into the prompt.
        main_topic: Main topic of the research.
        uuid: Identifier used as the second component of the upload path.
        customer_name: Customer name used as the first component of the
            upload path.
        model_name: Model name used as the third component of the upload
            path.
        chat_func: Function to send chat messages to the chatbot.

    Returns:
        List[str]: List of generated subheadings.

    """
    summaries = " ".join(relevant_papers_df['Summary'].tolist())
    prompt = (
        f"The main topic is '{main_topic}'. Based on this topic and the following summaries from relevant research papers, "
        "generate 3-5 hierarchical subheadings that progressively explore the topic. Begin with broader subheadings and "
        "move towards more specific themes, avoiding overlap in scope or content. Subheadings should be distinct and arranged "
        "in a logical order suitable for a structured review.\n\n"
        f"Summaries:\n{summaries}\n\n"
        "Output format:\n- Subheading 1\n- Subheading 2\n- Subheading 3\n..."
    )
    response = await chat_func(prompt)
    raw_lines = response.choices[0].message.content.strip().splitlines()

    # Drop the "- " list markers requested by the output format and skip
    # empty lines; otherwise headings would carry the bullet into later
    # prompts, section titles and subheading comparisons.
    subheadings = []
    for line in raw_lines:
        heading = line.strip().lstrip("-•").strip()
        if heading:
            subheadings.append(heading)
    logger.info("Generated Subheadings:\n" + "\n".join(subheadings))

    output_filename = f"{customer_name}/{uuid}/{model_name}/generated_subheadings.txt"
    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_filename,
        file_content="\n".join(subheadings)
    )
    logger.info(f"Subheadings saved to {output_filename}")
    return subheadings


# Function to assign summaries to subheadings with minimum allocation of references per subheading
async def assign_subheadings_to_summaries(
    relevant_papers_df, 
    subheadings, 
    uuid, customer_name, model_name,
    chat_func
):
    """
    Assign summaries to subheadings with minimum allocation of references per subheading.

    One prompt per summary is sent concurrently. The text after the
    "Subheading:" label of each reply (first line only) is taken as the
    choice and, when it equals one of ``subheadings`` apart from case,
    surrounding quotes or bullet markers, it is replaced by that exact
    entry. Afterwards every subheading holding fewer than
    ``ceil(total_papers / (len(subheadings) + 1))`` papers is topped up with
    randomly sampled papers from other subheadings. The top-up is best
    effort: a later top-up may take papers from a subheading handled
    earlier.

    Args:
        relevant_papers_df: DataFrame containing relevant papers. It is
            modified in place: an 'Assigned Subheading' column is written.
        subheadings: List of subheadings.
        uuid: Unique identifier for the task.
        customer_name: Name of the customer.
        model_name: Model name used as the third component of the upload
            path.
        chat_func: Function to send chat messages to the chatbot.

    Returns:
        DataFrame with assigned subheadings (the same object that was
        passed in).

    """
    column = 'Assigned Subheading'
    total_papers = len(relevant_papers_df)
    min_papers_per_subheading = math.ceil(
        total_papers / (len(subheadings) + 1))

    prompts = []
    for summary in relevant_papers_df['Summary']:
        prompt = (
            "Given the following subheadings and a research paper summary, determine the most appropriate subheading "
            "for this summary. Each subheading should cover a unique aspect of the main topic without overlap. "
            "Select the best-fitting subheading based on thematic relevance and coherence with similar studies.\n\n"
            f"Subheadings:\n{subheadings}\n\n"
            f"Summary:\n{summary}\n\n"
            "Output format:\nSubheading: [Chosen subheading]"
        )
        prompts.append(prompt)
    responses = await asyncio.gather(
        *(chat_func(prompt) for prompt in prompts)
    )

    # Map a decoration-insensitive form of each known subheading back to
    # the exact list entry, so replies can be matched reliably.
    canonical = {_normalize_heading(item): item for item in subheadings}

    assigned_subheadings = []
    for response in responses:
        content = response.choices[0].message.content.strip()
        # Keep everything after the label; a heading may itself contain
        # ": ", and a reply without the label is used as-is.
        _, label, chosen = content.partition("Subheading:")
        answer = (chosen if label else content).strip()
        answer = answer.splitlines()[0].strip() if answer else ""
        assigned_subheadings.append(
            canonical.get(_normalize_heading(answer), answer))

    relevant_papers_df[column] = assigned_subheadings

    # Ensure minimum papers per subheading. The count is recomputed for each
    # subheading because earlier top-ups move papers between subheadings.
    for subheading in subheadings:
        current = int((relevant_papers_df[column] == subheading).sum())
        shortfall = min_papers_per_subheading - current
        if shortfall <= 0:
            continue
        candidates = relevant_papers_df[relevant_papers_df[column] != subheading]
        extra_summaries = candidates.sample(min(shortfall, len(candidates)))
        relevant_papers_df.loc[extra_summaries.index, column] = subheading

    prefix = f"{customer_name}/{uuid}/{model_name}/"
    csv_filename = os.path.join(prefix, "assigned_subheadings.csv")

    await upload_dataframe_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=csv_filename,
        df=relevant_papers_df,
    )

    logger.info(f"Assigned subheadings saved to {csv_filename}")
    logger.info(f"Found {len(relevant_papers_df)} related papers")

    return relevant_papers_df


def _normalize_heading(text):
    """Return *text* without surrounding bullets/quotes/markdown, case-folded."""
    return text.strip(" \t-•*'\"`").casefold()


# Function to create expanded paragraphs with required reference count and consistent reference indexing
async def create_paragraphs_by_subheading(
    relevant_papers_df, subheadings, main_topic, 
    uuid, customer_name, model_name,
    chat_func
):
    """
    Create expanded paragraphs by subheading with required reference count and consistent reference indexing.

    The review is assembled from an introduction, one paragraph per
    subheading (requested concurrently), a conclusion and a reference list.
    Reference numbers are assigned per distinct title in order of first
    appearance, and each title is listed once in the reference section.

    Args:
        relevant_papers_df (pd.DataFrame): DataFrame containing relevant papers and their summaries.
            The columns 'Assigned Subheading', 'Summary', 'Title',
            'First Author' and 'Publication Date' are read.
        subheadings (list): List of subheadings for the review paper.
        main_topic (str): Main topic of the review paper.
        uuid (str): UUID of the task.
        customer_name (str): Name of the customer.
        model_name (str): Model name used as the third component of the
            upload path.
        chat_func (function): Function to send chat messages to the chatbot.

    Returns:
        str: The complete non-refined review text (sections joined by blank
        lines). The same text is uploaded as ``review_non_refined.txt`` and
        the subheading/summary pairs as ``grouped_summaries.csv``.

    """
    paragraphs = []

    # Introduction
    intro_prompt = (
        f"Write a concise and advanced introductory paragraph for a scientific review paper on '{main_topic}'. "
        "Introduce the topic, its importance, and the scope of the review. The introduction should provide a logical "
        "setup for the following subheadings.\n\n"
        "Output format:\n[Write introduction here]"
    )
    intro_response = await chat_func(intro_prompt)
    intro_paragraph = intro_response.choices[0].message.content.strip()
    paragraphs.append(f"**Introduction**\n{intro_paragraph}\n")

    # Body paragraphs based on subheadings with consistent reference numbering
    reference_map = {}      # title -> reference number
    used_references = []    # (title, author, pub_date), one entry per title

    paragraph_prompts = []
    for subheading in subheadings:
        relevant_summaries = relevant_papers_df[relevant_papers_df['Assigned Subheading'] == subheading]
        reference_columns = relevant_summaries[
            ['Summary', 'Title', 'First Author', 'Publication Date']]

        summaries_text = []
        for summary, title, author, pub_date in reference_columns.itertuples(
                index=False, name=None):
            if title not in reference_map:
                # First time this title is cited: give it the next number
                # and record it once for the reference list.
                reference_map[title] = len(reference_map) + 1
                used_references.append((title, author, pub_date))
            summaries_text.append(f"{summary} [Ref: {reference_map[title]}]")

        # Compose prompt to generate an extended paragraph of about 800 words
        paragraph_prompt = (
            f"Write an 800-word thematic and critical paragraph under the subheading '{subheading}' for a scientific review on '{main_topic}'. "
            f"Combine the following summaries into a coherent, well-structured paragraph discussing the studies’ objectives, findings, "
            "and methodologies. Use advanced academic language, include in-text citations in the format [Ref: number], and avoid repeating "
            "content from previous sections. Provide critical insights and comparative analysis where relevant.\n\n"
            f"Summaries:\n{' '.join(summaries_text)}\n\n"
            "Output format:\n[Write paragraph here]"
        )
        paragraph_prompts.append(paragraph_prompt)

    paragraph_responses = await asyncio.gather(
        *(chat_func(para_prompt)
          for para_prompt in paragraph_prompts)
    )
    for subheading, paragraph_response in \
            zip(subheadings, paragraph_responses):
        paragraph = f"**{subheading}**\n{paragraph_response.choices[0].message.content.strip()}\n"
        paragraphs.append(paragraph)

    # Conclusion
    conclusion_prompt = (
        f"Write a concluding paragraph for a scientific review on '{main_topic}'. Summarize the main points discussed in the previous sections, "
        "highlight the significance of the research, and suggest possible future directions or applications.\n\n"
        "Output format:\n[Write conclusion here]"
    )
    conclusion_response = await chat_func(conclusion_prompt)
    conclusion_paragraph = conclusion_response.choices[0].message.content.strip()
    paragraphs.append(f"**Conclusion**\n{conclusion_paragraph}\n")

    # References section (only used references)
    references = "\n".join(
        [f"[Ref: {reference_map[title]}] {title}, {author}, {pub_date}" 
         for title, author, pub_date in used_references]
    )
    paragraphs.append(f"**References**\n{references}")

    # Compile paragraphs into final content
    final_content = "\n\n".join(paragraphs)

    # Upload the grouped summaries and the draft under the task's prefix
    prefix = f"{customer_name}/{uuid}/{model_name}/"
    csv_filename = os.path.join(prefix, "grouped_summaries.csv")
    output_filename = os.path.join(prefix, "review_non_refined.txt")

    grouped_data = relevant_papers_df[['Assigned Subheading', 'Summary']]
    await upload_dataframe_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=csv_filename,
        df=grouped_data
    )

    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_filename,
        file_content=final_content
    )

    logger.info(f"\nGrouped summaries saved to {csv_filename}")
    logger.info(f"Non-refined review saved to {output_filename}")
    return final_content


# Function to enhance language and readability to meet Nature journal style
async def enhance_language_readability(
    content, 
    uuid, customer_name, model_name,
    chat_func
):
    """
    Enhance the language and readability of the given content to meet the style of the *Nature* journal.

    The content is split on blank lines and each section is rewritten by
    the chat model concurrently. Two kinds of sections are not sent to the
    model: whitespace-only sections (dropped), and everything from the
    section starting with ``**References**`` onwards (kept verbatim so the
    reference list is not reworded).

    Args:
        content (str): The content to enhance.
        uuid (str): Identifier used as the second component of the upload
            path.
        customer_name (str): Customer name used as the first component of
            the upload path.
        model_name (str): Model name used as the third component of the
            upload path.
        chat_func (function): The function to use for the chat completion.

    Returns:
        str: The enhanced content. It is also uploaded as
        ``review_paper.txt`` under the task's prefix.

    """
    body_sections = []
    reference_sections = []
    for section in content.split("\n\n"):
        stripped = section.strip()
        if not stripped:
            # Nothing to enhance; an empty prompt would only invite
            # invented text.
            continue
        if reference_sections or stripped.startswith("**References**"):
            reference_sections.append(stripped)
        else:
            body_sections.append(section)

    prompts = []
    for section in body_sections:
        prompt = (
            "Enhance the following text to align with the writing style of *Nature* journal. Refine language to be sophisticated and objective, "
            "using advanced vocabulary and a factual tone. Ensure a high level of lexical diversity and rhythm, with alternating sentence lengths "
            "and varied structures for readability. Avoid emotional, speculative, or conversational language, focusing on objective analysis.\n\n"
            f"Text:\n{section}\n\n"
            "Output format:\n[Enhanced text here]"
        )
        prompts.append(prompt)

    responses = await asyncio.gather(
        *(chat_func(prompt) for prompt in prompts)
    )
    enhanced_sections = [
        response.choices[0].message.content.strip() for response in responses
    ]

    enhanced_content = "\n\n".join(enhanced_sections + reference_sections)
    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=f"{customer_name}/{uuid}/{model_name}/review_paper.txt",
        file_content=enhanced_content
    )

    return enhanced_content


async def process_papers(
    dataframe, topic, direction, 
    uuid, customer_name, model_name,
    chat_func
):
    """
    Filter papers by relevance, summarize the relevant ones and upload the result as CSV text.

    Every row is checked with ``is_relevant`` (concurrently); the rows
    flagged as relevant are then summarized with ``summarize_abstract``
    (concurrently) and written, one CSV line each, below a header line.

    Args:
        dataframe (pandas.DataFrame): The DataFrame containing the papers.
            The columns "TI", "AB", "JT", "DCOM" and "FAU-frist" are read.
        topic (str): The topic to filter the papers by.
        direction (str): The direction to filter the papers by.
        uuid (str): The UUID of the task.
        customer_name (str): The name of the customer.
        model_name (str): Model name used as the third component of the
            upload path.
        chat_func (function): The function to use for the chat completion.

    Returns:
        str: Object name of the uploaded ``relevant_papers.csv``.

    """
    output_path = os.path.join(
        f"{customer_name}/{uuid}/{model_name}/", "relevant_papers.csv")

    header = ["Journal Title", "Publication Date", "Title",
              "First Author", "Summary", "Is Relevant", "Relevance Keywords"]
    csv_lines = [",".join(escape_csv_field(name) for name in header)]

    # One record per input row: (title, abstract, journal, date, author)
    papers = [
        (row["TI"], row["AB"], row["JT"], row["DCOM"], row["FAU-frist"])
        for _, row in dataframe.iterrows()
    ]

    verdicts = await asyncio.gather(
        *(is_relevant(paper[0], paper[1], topic, direction, chat_func)
          for paper in papers)
    )

    # Keep only the papers whose relevance flag is truthy
    kept = [
        (paper, verdict[0], verdict[1])
        for paper, verdict in zip(papers, verdicts)
        if verdict[0]
    ]

    summaries = await asyncio.gather(
        *(summarize_abstract(paper[0], paper[1], paper[4], chat_func)
          for paper, _, _ in kept)
    )

    for summary, (paper, flag, keywords) in zip(summaries, kept):
        title, _, journal_title, pub_date, first_author = paper
        cells = [
            escape_csv_field(journal_title),
            escape_csv_field(pub_date),
            escape_csv_field(title),
            escape_csv_field(first_author),
            escape_csv_field(summary),
            flag,
            escape_csv_field(keywords),
        ]
        csv_lines.append(",".join(str(cell) for cell in cells))

        # Log the escaped summary and keywords that were just added
        logger.info(f"Added summary: {cells[4]}")
        logger.info(f"Relevance Keywords: {cells[6]}")

    texts = "\n".join(csv_lines) + "\n"
    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_path,
        file_content=texts
    )

    return output_path


async def translate_to_chinese_before_references(
    text,
    uuid, customer_name, model_name,
    chat_func
):
    """
    Translate a review to Chinese while keeping the '**References**' section untouched.

    Everything before the first line that equals ``**References**`` is split
    on blank lines and translated section by section (concurrently); that
    line and everything after it is appended unchanged. The result is
    uploaded as ``review_paper_translated.txt`` under the task's prefix.

    Args:
        text (str): The content to translate.
        uuid (str): Identifier used as the second component of the upload
            path.
        customer_name (str): Customer name used as the first component of
            the upload path.
        model_name (str): Model name used as the third component of the
            upload path.
        chat_func (function): The function to use for translation.

    Returns:
        str: Object name of the uploaded translated file.

    """
    lines = text.split("\n")

    # Index of the first line that is exactly the references header, if any
    references_index = next(
        (position for position, line in enumerate(lines)
         if line.strip() == "**References**"),
        None,
    )

    # Split into translatable body and verbatim references
    if references_index is None:
        # No references header found: treat the whole text as body
        main_content_lines, references_content_lines = lines, []
    else:
        main_content_lines = lines[:references_index]
        references_content_lines = lines[references_index:]

    main_content = "\n".join(main_content_lines)

    # One translation prompt per blank-line-separated section of the body
    prompts = [
        (
            "Translate the following text to academic Chinese:\n\n"
            f"Text:\n{section}\n\n"
            "Output format:\n[Translated Chinese text here]"
        )
        for section in main_content.split("\n\n")
    ]

    responses = await asyncio.gather(
        *(chat_func(prompt) for prompt in prompts)
    )

    # Re-assemble the translated body
    translated_content = "\n\n".join(
        response.choices[0].message.content.strip() for response in responses
    )

    # Append the untouched references section, when there is one
    final_content = translated_content
    if references_content_lines:
        final_content = (
            translated_content + "\n\n" + "\n".join(references_content_lines)
        )

    # Store the result as a new object
    output_filename = f"{customer_name}/{uuid}/{model_name}/review_paper_translated.txt"
    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_filename,
        file_content=final_content
    )

    logger.info(f"\nTranslated content saved to {output_filename}")
    return output_filename


# Main function to automate the review paper creation process with language enhancement step
async def create_review_paper(
    relevant_papers_df,
    main_topic,
    uuid, customer_name, model_name,
    chat_func,
    translate_to_cn=False
):
    """
    Main function to automate the review paper creation process with language enhancement step.

    Runs the pipeline in order: generate subheadings, assign each summary to
    a subheading, write the draft review, enhance its language, optionally
    translate it to Chinese, and upload the enhanced text.

    Args:
        relevant_papers_df (pd.DataFrame): DataFrame containing relevant papers.
        main_topic (str): Main topic of the review paper.
        uuid (str): Unique identifier for the review paper.
        customer_name (str): Name of the customer.
        model_name (str): Model name used as the third component of the
            upload paths.
        chat_func (function): Function to handle chat interactions.
        translate_to_cn (bool): Flag to indicate if translation to Chinese is required.

    Returns:
        str: Object name of the uploaded ``review_paper.txt``.

    """

    # Step 1: Generate subheadings related to the main topic
    subheadings = await generate_subheadings(
        relevant_papers_df, main_topic,
        uuid, customer_name, model_name,
        chat_func
    )

    # Step 2: Assign each summary to a subheading
    relevant_papers_df = await assign_subheadings_to_summaries(
        relevant_papers_df, subheadings, 
        uuid, customer_name, model_name,
        chat_func
    )

    # Step 3: Create paragraphs by subheading, with introductory and concluding sections, and references
    review_content = await create_paragraphs_by_subheading(
        relevant_papers_df, subheadings, main_topic, 
        uuid, customer_name, model_name,
        chat_func
    )

    # Step 4: Enhance language and readability
    enhanced_content = await enhance_language_readability(
        review_content,
        uuid, customer_name, model_name,
        chat_func
    )

    prefix = f"{customer_name}/{uuid}/{model_name}/"
    output_filename = os.path.join(prefix, "review_paper.txt")

    # Step 5: Translate to Chinese. The translation helper derives its own
    # object name from uuid/customer_name/model_name, so no filename is
    # passed to it.
    if translate_to_cn:
        await translate_to_chinese_before_references(
            enhanced_content,
            uuid, customer_name, model_name,
            chat_func
        )

    # Step 6: Upload the enhanced review text
    await upload_text_to_minio(
        bucket_name=BUCKET_NAME,
        object_name=output_filename,
        file_content=enhanced_content
    )

    logger.info(f"\nReview paper saved to {output_filename}")
    return output_filename