Jessie0201 committed on
Commit
27b16dd
·
verified ·
1 Parent(s): e469069

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +82 -0
  2. utils.py +160 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
import pandas as pd
from dotenv import load_dotenv

# Import only the utils names app.py actually uses. The original
# `from utils import *` worked, but app.py then relied on `pd` leaking
# through the star import instead of importing pandas itself.
from utils import process_all_papers, search_and_summarize_with_llm

# Load environment variables (e.g. HUGGINGFACE_TOKEN used by utils) from .env.
load_dotenv()

# Input locations: the citation spreadsheet and the directory of paper PDFs.
reference_file_path = "./data/reference.xlsx"
pdf_directory = "./data/pdf/"

# Fail fast with a clear message if the reference spreadsheet is missing.
if not os.path.exists(reference_file_path):
    raise FileNotFoundError("Reference file not found. Please ensure 'data/reference.xlsx' exists in the workspace.")
# Citation reference table; expected to have 'Index' and 'Citation' columns
# (see process_all_papers).
reference = pd.read_excel(reference_file_path)

# Summarize every paper up front so both Gradio handlers can reuse the results.
paper_summaries = process_all_papers(pdf_directory, reference=reference)
19
+
20
def display_summaries(summaries=None):
    """Format paper summaries for display in the Gradio output textbox.

    Args:
        summaries: Optional list of summary dicts (as produced by
            process_all_papers). Defaults to the module-level
            ``paper_summaries`` so the existing zero-argument Gradio
            callback keeps working unchanged.

    Returns:
        A single string with one labelled section per paper, each
        followed by a dashed divider.
    """
    if summaries is None:
        summaries = paper_summaries
    # Build the pieces in a list and join once; the original `+=` in a
    # loop is quadratic in the total output size.
    parts = [
        (
            f"Paper ID: {summary['ID']}\n"
            f"Citation: {summary['Citation']}\n"
            f"Context: {summary['Context']}\n"
            f"Research Question and Findings: {summary['Research Question and Findings']}\n"
            f"Theme of Research: {summary['Theme of Research']}\n"
            f"Method: {summary['Method']}\n"
            f"Contribution: {summary['Contribution']}\n"
            f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            "------------------------------------------\n\n"
        )
        for summary in summaries
    ]
    return "".join(parts)
39
+
40
+
41
def retrieve_and_display_search_results(user_input):
    """Search the precomputed paper summaries and return displayable text.

    Args:
        user_input: Free-text search criteria typed by the user.

    Returns:
        The LLM-generated cohesive summary followed by the formatted
        citation list, separated by a blank line.
    """
    # Delegate matching and summarization to the utils helper, then stitch
    # the two result pieces into a single string for the output textbox.
    summary_text, citations_text = search_and_summarize_with_llm(paper_summaries, user_input)
    return f"{summary_text}\n\n{citations_text}"
49
+
50
+
51
# ---- Gradio user interface ----
with gr.Blocks() as demo:
    # Section 1: batch summarization of all processed papers.
    gr.Markdown("# Academic Paper Summarization Tool")
    gr.Markdown("Click 'Begin Summarization' to process and summarize the 32 papers.")

    summary_box = gr.Textbox(
        label="Summarization Output",
        placeholder="Summaries will appear here after processing...",
        lines=30,
        interactive=False,
    )

    summarize_btn = gr.Button("Begin Summarization")
    summarize_btn.click(fn=display_summaries, inputs=None, outputs=summary_box)

    # Section 2: criteria-driven search over the same summaries.
    gr.Markdown("# Research Summarization Tool")
    gr.Markdown("Type your search criteria below (e.g., 'I want all research about human VS AI and empirical research')")

    criteria_box = gr.Textbox(label="Search Criteria", placeholder="Enter your search criteria here...")
    search_btn = gr.Button("Search Relevant Articles")
    results_box = gr.Textbox(
        label="Search Results",
        placeholder="Results of search will appear here...",
        lines=30,
        interactive=False,
    )

    search_btn.click(fn=retrieve_and_display_search_results, inputs=criteria_box, outputs=results_box)

# Launch the Gradio app.
demo.launch()
utils.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pdfplumber
3
+ import pandas as pd
4
+ import re
5
+ from transformers import pipeline
6
+ import pandas as pd
7
+
8
+
9
def extract_text_from_pdf(pdf_file_path):
    """Extract the plain text of every page of a PDF with pdfplumber.

    Args:
        pdf_file_path: Path to the PDF file to read.

    Returns:
        The concatenated text of all pages. Pages for which pdfplumber's
        extract_text() returns None (e.g. image-only/scanned pages) are
        skipped — the original ``text += page.extract_text()`` raised
        TypeError on such pages.
    """
    with pdfplumber.open(pdf_file_path) as pdf:
        # Join once instead of repeated `+=` (quadratic for large PDFs).
        return "".join(page.extract_text() or "" for page in pdf.pages)
16
+
17
def structure_summary_output(text):
    """Summarize a paper's text into six labelled sections via an LLM.

    Args:
        text: Full text of the paper (as produced by extract_text_from_pdf).

    Returns:
        Dict with keys "Context", "Research Question and Findings",
        "Theme of Research", "Method", "Contribution" and
        "Future Potential and Limitations". A value stays "" when the
        corresponding section cannot be located in the model output.
    """
    # NOTE(review): the pipeline is rebuilt (and the model reloaded) on every
    # call — consider caching it at module level if this is a bottleneck.
    pipe = pipeline(
        "text-generation",
        model="meta-llama/Llama-2-70b-hf",
        token=os.getenv("HUGGINGFACE_TOKEN"),  # `use_auth_token` is deprecated in transformers
    )
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f"   - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f"   - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f"   Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f"   - Conceptual/Case Study\n"
        f"   - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f"   - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f"   Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    # return_full_text=False keeps the prompt out of generated_text. Without
    # it, text-generation pipelines echo the prompt, and the section regex
    # below would match the empty "- Context:\n- Research Question..."
    # skeleton inside the prompt instead of the model's actual answers.
    output = pipe(prompt, max_new_tokens=512, return_full_text=False)

    summary_text = output[0]["generated_text"]

    # Target sections, all defaulting to "" if not found in the output.
    sections = {
        "Context": "",
        "Research Question and Findings": "",
        "Theme of Research": "",
        "Method": "",
        "Contribution": "",
        "Future Potential and Limitations": "",
    }

    # For each "- <Header>:" capture everything up to the next "- <Capital>"
    # header (or end of text).
    for section in sections:
        match = re.search(rf"- {section}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL)
        if match:
            sections[section] = match.group(1).strip()

    return sections
63
+
64
def process_all_papers(pdf_directory, reference, num_papers=32):
    """Summarize every numbered paper PDF found in a directory.

    Expects PDFs named "1.pdf" ... f"{num_papers}.pdf". Papers whose file
    is missing are silently skipped (original behavior preserved).

    Args:
        pdf_directory: Directory containing the numbered PDF files.
        reference: DataFrame with 'Index' and 'Citation' columns mapping
            paper numbers to citations.
        num_papers: Highest paper number to look for. Defaults to 32,
            the previously hard-coded value, so existing callers are
            unaffected.

    Returns:
        List of summary dicts (ID, Citation plus the six sections from
        structure_summary_output), one per successfully processed paper.

    Raises:
        IndexError: If a processed paper has no matching 'Index' row in
            *reference* (unchanged from the original lookup).
    """
    paper_summaries = []

    for paper_index in range(1, num_papers + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")
        if not os.path.exists(pdf_file_path):
            # Missing papers are skipped rather than treated as an error.
            continue

        # Extract the raw text, then ask the LLM for the structured summary.
        text = extract_text_from_pdf(pdf_file_path)
        summary = structure_summary_output(text)

        # Look up the citation by paper number in the reference table.
        # NOTE(review): raises IndexError when the index is absent.
        citation = reference.loc[reference['Index'] == paper_index, 'Citation'].values[0]

        # structure_summary_output returns exactly the six section keys,
        # so merging it yields the same record shape as before.
        paper_summaries.append({
            "ID": paper_index,
            "Citation": citation,
            **summary,
        })

    return paper_summaries
96
+
97
+
98
+
99
def interpret_search_criteria(user_input):
    """Map free-text search input onto canonical theme/method labels.

    Args:
        user_input: Free-text criteria typed by the user.

    Returns:
        Dict {"Theme": theme, "Method": method}; either value is "" when
        no keyword matched (an empty criterion then matches every paper
        in search_and_summarize_with_llm).
    """
    # Lower-case once (the original recomputed .lower() per test) and drop
    # the period in "vs." so that the canonical label "Human vs. AI" itself
    # is recognized — the original substring test "human vs ai" failed on
    # the dotted form.
    normalized = user_input.lower().replace("vs.", "vs")

    theme = ""
    if "human vs ai" in normalized:
        theme = "Human vs. AI"
    elif "human + ai" in normalized or "collaboration" in normalized:
        theme = "Human + AI Collaboration"

    method = ""
    if "empirical" in normalized:
        method = "Empirical Study"
    elif "conceptual" in normalized or "case study" in normalized:
        method = "Conceptual/Case Study"
    elif "modeling" in normalized:
        method = "Modeling"

    return {"Theme": theme, "Method": method}
119
+
120
+
121
def search_and_summarize_with_llm(paper_summaries, user_input):
    """Filter summaries by interpreted criteria and produce a cohesive LLM summary.

    Args:
        paper_summaries: List of summary dicts from process_all_papers.
        user_input: Free-text search criteria.

    Returns:
        Tuple (cohesive_summary, formatted_citations): LLM-generated prose
        over the matching papers, and a bulleted "Citations:" list.
    """
    # Interpret the search criteria from the user input.
    search_criteria = interpret_search_criteria(user_input)
    # Hoist the lowercased filters out of the loop.
    theme_filter = search_criteria['Theme'].lower()
    method_filter = search_criteria['Method'].lower()

    relevant_summaries = []
    citation_list = []

    for summary in paper_summaries:
        # An empty criterion ("") is a substring of everything, so an
        # unspecified theme/method matches every paper.
        if theme_filter in summary["Theme of Research"].lower() and method_filter in summary["Method"].lower():
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])

    combined_text = "\n".join(relevant_summaries)
    # Fixed: the original had a duplicated assignment (`pipe = pipe = ...`)
    # and used the deprecated `use_auth_token` argument.
    pipe = pipeline("text-generation", model="meta-llama/Llama-2-70b-hf", token=os.getenv("HUGGINGFACE_TOKEN"))
    # Fixed: the two f-string pieces were concatenated without a space,
    # yielding "...{Method},provide a cohesive...".
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    # return_full_text=False so the echoed prompt is not part of the summary.
    summary_output = pipe(prompt, max_new_tokens=512, return_full_text=False)
    cohesive_summary = summary_output[0]["generated_text"]

    # Format the citation list as a bulleted block.
    formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)

    return cohesive_summary, formatted_citations