File size: 7,256 Bytes
463d8cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import os
import pdfplumber
import pandas as pd
import re
from transformers import pipeline


# Load the Hugging Face token from the environment (None if the variable is
# unset; transformers then falls back to locally cached credentials).
token = os.getenv("HUGGINGFACE_TOKEN")

# Initialize the text-generation pipeline once at import time so every
# function in this module can share the single loaded model.
# NOTE(review): `use_auth_token` is deprecated in recent transformers
# releases in favour of `token=` — confirm against the installed version.
pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf", use_auth_token=token, device_map="auto")


# Function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated text of every page of the PDF at *pdf_file_path*.

    pdfplumber's ``extract_text`` returns ``None`` for pages with no
    extractable text (e.g. scanned images); those pages are treated as
    empty instead of raising a ``TypeError`` on concatenation.
    """
    pages_text = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            # `or ""` guards against the None return for image-only pages.
            pages_text.append(page.extract_text() or "")
    # join() avoids quadratic string concatenation across many pages.
    return "".join(pages_text)

# Define the function to structure the model output into required fields
def structure_summary_output(text):
    """Summarize an academic paper's text into six structured fields.

    Sends *text* to the shared ``pipe`` text-generation model with a fixed
    instruction prompt, then parses the model's answer into a dict keyed by
    section name. Sections the model did not produce are left as "".

    Parameters:
        text: Full text of the paper (as returned by extract_text_from_pdf).

    Returns:
        dict mapping each of the six expected section names to its
        extracted text (possibly empty).
    """
    global pipe
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f"   - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f"   - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f"   Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f"   - Conceptual/Case Study\n"
        f"   - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f"   - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f"   Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    output = pipe(prompt, max_new_tokens=512)

    # The text-generation pipeline echoes the prompt at the start of
    # "generated_text" by default. Strip it so the section regexes below
    # match the model's answer rather than the empty "- Context:\n- ..."
    # template embedded in the prompt itself (which made every section "").
    summary_text = output[0]["generated_text"]
    if summary_text.startswith(prompt):
        summary_text = summary_text[len(prompt):]

    # Result skeleton: every expected section starts out empty.
    sections = {
        "Context": "",
        "Research Question and Findings": "",
        "Theme of Research": "",
        "Method": "",
        "Contribution": "",
        "Future Potential and Limitations": ""
    }

    # For each header, capture everything up to the next "- <Capital>" header
    # (or end of text); re.DOTALL lets a section span multiple lines.
    for section in sections:
        match = re.search(rf"- {re.escape(section)}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL)
        if match:
            sections[section] = match.group(1).strip()

    # Return the extracted sections
    return sections

# Process each PDF and summarize
def process_all_papers(pdf_directory, reference, paper_count=32):
    """Summarize the numbered PDFs ``1.pdf`` .. ``<paper_count>.pdf``.

    Parameters:
        pdf_directory: Directory containing the numbered PDF files.
        reference: pandas.DataFrame with 'Index' and 'Citation' columns
            mapping each paper number to its citation string.
        paper_count: Highest paper index to look for (default 32, the
            original corpus size; now parameterized instead of hard-coded).

    Returns:
        list of dicts, one per PDF that exists on disk, each holding the
        ID, citation, and the six structured summary fields.
    """
    paper_summaries = []

    for paper_index in range(1, paper_count + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")

        # Silently skip gaps in the numbering.
        if not os.path.exists(pdf_file_path):
            continue

        # Extract text from the PDF using pdfplumber, then summarize it.
        text = extract_text_from_pdf(pdf_file_path)
        summary = structure_summary_output(text)

        # Look up the citation in the reference dataframe; fall back to ""
        # instead of raising IndexError when the index row is missing.
        citation_values = reference.loc[reference['Index'] == paper_index, 'Citation'].values
        citation = citation_values[0] if len(citation_values) else ""

        paper_summaries.append({
            "ID": paper_index,
            "Citation": citation,
            "Context": summary["Context"],
            "Research Question and Findings": summary["Research Question and Findings"],
            "Theme of Research": summary["Theme of Research"],
            "Method": summary["Method"],
            "Contribution": summary["Contribution"],
            "Future Potential and Limitations": summary["Future Potential and Limitations"]
        })

    return paper_summaries



def interpret_search_criteria(user_input):
    """
    Determines search criteria based on user input text.

    Maps free-text keywords in *user_input* (case-insensitively) onto the
    fixed Theme and Method labels used by the paper summaries; an
    unrecognized field stays "".
    """
    # Lowercase once so every keyword test below shares the same string.
    query = user_input.lower()

    if "human vs ai" in query:
        theme = "Human vs. AI"
    elif "human + ai" in query or "collaboration" in query:
        theme = "Human + AI Collaboration"
    else:
        theme = ""

    if "empirical" in query:
        method = "Empirical Study"
    elif "conceptual" in query or "case study" in query:
        method = "Conceptual/Case Study"
    elif "modeling" in query:
        method = "Modeling"
    else:
        method = ""

    return {"Theme": theme, "Method": method}


def search_and_summarize_with_llm(paper_summaries, user_input):
    """
    Retrieves relevant articles based on interpreted search criteria and
    generates a cohesive summary.

    Parameters:
        paper_summaries: list of summary dicts as produced by
            process_all_papers.
        user_input: Free-text query; keywords are mapped to Theme/Method
            filters by interpret_search_criteria.

    Returns:
        (cohesive_summary, formatted_citations) tuple of strings.
    """
    global pipe
    # Interpret the search criteria from the user input
    search_criteria = interpret_search_criteria(user_input)

    # Collect all relevant summaries and citations
    relevant_summaries = []
    citation_list = []

    theme_filter = search_criteria['Theme'].lower()
    method_filter = search_criteria['Method'].lower()

    for summary in paper_summaries:
        # An empty filter matches everything ("" is a substring of any
        # string), so an unrecognized query falls back to the full corpus.
        if theme_filter in summary["Theme of Research"].lower() and method_filter in summary["Method"].lower():
            # Append the full information of this paper summary
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])

    # Nothing matched: skip the LLM call rather than asking it to summarize
    # an empty context.
    if not relevant_summaries:
        return "No matching papers were found for the given criteria.", "Citations:\n"

    # Generate a cohesive summary using the LLM
    combined_text = "\n".join(relevant_summaries)

    # Trailing space after the comma fixes the original "...Method,provide"
    # concatenation artifact in the prompt.
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    summary_output = pipe(prompt, max_new_tokens=512)

    # The pipeline echoes the prompt in "generated_text" by default; strip
    # it so callers receive only the model's answer.
    cohesive_summary = summary_output[0]["generated_text"]
    if cohesive_summary.startswith(prompt):
        cohesive_summary = cohesive_summary[len(prompt):]

    # Format the citation list
    formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)

    return cohesive_summary, formatted_citations