Jessie0201 committed on
Commit
463d8cd
·
verified ·
1 Parent(s): 2d10a3e

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +168 -168
utils.py CHANGED
@@ -1,168 +1,168 @@
1
- import os
2
- import pdfplumber
3
- import pandas as pd
4
- import re
5
- from transformers import pipeline
6
- import pandas as pd
7
-
8
-
9
- # Load the Hugging Face token from the environment
10
- token = os.getenv("HUGGINGFACE_TOKEN")
11
-
12
- # Initialize the pipeline with the token
13
- pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf", use_auth_token=token)
14
-
15
-
16
- # Function to extract text from PDF using pdfplumber
17
- def extract_text_from_pdf(pdf_file_path):
18
- text = ""
19
- with pdfplumber.open(pdf_file_path) as pdf:
20
- for page in pdf.pages:
21
- text += page.extract_text() # Extract text from each page
22
- return text
23
-
24
- # Define the function to structure the model output into required fields
25
- def structure_summary_output(text):
26
- global pipe
27
- prompt = (
28
- f"Please summarize the following information from the academic paper:\n"
29
- f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
30
- f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
31
- f"3. Theme of Research:\n"
32
- f" - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
33
- f" - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
34
- f" Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
35
- f"4. Method: Classify the study method as one of the following:\n"
36
- f" - Conceptual/Case Study\n"
37
- f" - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
38
- f" - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
39
- f" Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
40
- f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
41
- f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
42
- f"Paper content:\n{text}\n\n"
43
- f"Respond with the answers formatted in the following structure:\n"
44
- f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
45
- f"- Contribution:\n- Future Potential and Limitations:\n"
46
- )
47
- output = pipe(prompt, max_new_tokens=512)
48
-
49
- # Extract structured text from model output
50
- summary_text = output[0]["generated_text"]
51
-
52
- # Split the text into structured sections
53
- sections = {
54
- "Context": "",
55
- "Research Question and Findings": "",
56
- "Theme of Research": "",
57
- "Method": "",
58
- "Contribution": "",
59
- "Future Potential and Limitations": ""
60
- }
61
-
62
- # Regular expression to match each section header and text that follows
63
- for section in sections.keys():
64
- match = re.search(rf"- {section}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL)
65
- if match:
66
- sections[section] = match.group(1).strip()
67
-
68
- # Return the extracted sections
69
- return sections
70
-
71
- # Process each PDF and summarize
72
- def process_all_papers(pdf_directory, reference):
73
- paper_summaries = []
74
-
75
- for paper_index in range(1, 33):
76
- pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")
77
-
78
- if os.path.exists(pdf_file_path):
79
- # Extract text from the PDF using pdfplumber
80
- text = extract_text_from_pdf(pdf_file_path)
81
-
82
- # Get structured summary of the paper
83
- summary = structure_summary_output(text)
84
-
85
- # Find citation (assuming the file name contains the citation or we can get it from the reference dataframe)
86
- citation = reference.loc[reference['Index'] == paper_index, 'Citation'].values[0]
87
-
88
- # Store the information in the dictionary
89
- paper_summary = {
90
- "ID": paper_index,
91
- "Citation": citation,
92
- "Context": summary["Context"],
93
- "Research Question and Findings": summary["Research Question and Findings"],
94
- "Theme of Research": summary["Theme of Research"],
95
- "Method": summary["Method"],
96
- "Contribution": summary["Contribution"],
97
- "Future Potential and Limitations": summary["Future Potential and Limitations"]
98
- }
99
-
100
- paper_summaries.append(paper_summary)
101
-
102
- return paper_summaries
103
-
104
-
105
-
106
- def interpret_search_criteria(user_input):
107
- """
108
- Determines search criteria based on user input text.
109
- """
110
- theme = ""
111
- method = ""
112
-
113
- if "human vs ai" in user_input.lower():
114
- theme = "Human vs. AI"
115
- elif "human + ai" in user_input.lower() or "collaboration" in user_input.lower():
116
- theme = "Human + AI Collaboration"
117
-
118
- if "empirical" in user_input.lower():
119
- method = "Empirical Study"
120
- elif "conceptual" in user_input.lower() or "case study" in user_input.lower():
121
- method = "Conceptual/Case Study"
122
- elif "modeling" in user_input.lower():
123
- method = "Modeling"
124
-
125
- return {"Theme": theme, "Method": method}
126
-
127
-
128
- def search_and_summarize_with_llm(paper_summaries, user_input):
129
- """
130
- Retrieves relevant articles based on interpreted search criteria and generates a cohesive summary.
131
- """
132
- global pipe
133
- # Interpret the search criteria from the user input
134
- search_criteria = interpret_search_criteria(user_input)
135
-
136
- # Collect all relevant summaries and citations
137
- relevant_summaries = []
138
- citation_list = []
139
-
140
- for summary in paper_summaries:
141
- # paper_id, citation, context, rqf, theme, method, contribution, future = summary["ID"],
142
-
143
- # Check if this summary matches all search criteria
144
- if search_criteria['Theme'].lower() in summary["Theme of Research"].lower() and search_criteria['Method'].lower() in summary["Method"].lower():
145
- # Append the full information of this paper summary
146
- relevant_summaries.append(
147
- f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
148
- f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
149
- f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
150
- f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
151
- )
152
-
153
- citation_list.append(summary["Citation"])
154
-
155
- # Generate a cohesive summary using the LLM
156
- combined_text = "\n".join(relevant_summaries)
157
-
158
- prompt = (
159
- f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']},"
160
- f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
161
- )
162
- summary_output = pipe(prompt, max_new_tokens=512)
163
- cohesive_summary = summary_output[0]["generated_text"]
164
-
165
- # Format the citation list
166
- formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)
167
-
168
- return cohesive_summary, formatted_citations
 
1
import os
import re

import pandas as pd
import pdfplumber
from transformers import pipeline

# Load the Hugging Face token from the environment; required to download
# gated models such as meta-llama/Llama-2-7b-chat-hf.
token = os.getenv("HUGGINGFACE_TOKEN")

# Initialize the text-generation pipeline once at import time so every call
# in this module reuses the same loaded model. device_map="auto" lets
# accelerate place the weights on whatever device(s) are available.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    use_auth_token=token,
    device_map="auto",
)
16
# Function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated text of every page of the PDF at *pdf_file_path*.

    pdfplumber's ``Page.extract_text()`` returns ``None`` for pages with no
    extractable text (e.g. scanned images); treat those as empty strings
    instead of raising ``TypeError`` on concatenation.
    """
    page_texts = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            # `or ""` guards against the None return on image-only pages.
            page_texts.append(page.extract_text() or "")
    # join() avoids the quadratic cost of repeated `+=` on large PDFs.
    return "".join(page_texts)
24
# Define the function to structure the model output into required fields
def structure_summary_output(text):
    """Ask the LLM to summarize *text* and parse the answer into named sections.

    Returns a dict with the keys "Context", "Research Question and Findings",
    "Theme of Research", "Method", "Contribution" and
    "Future Potential and Limitations"; any section the model omits is left
    as an empty string.
    """
    global pipe
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f"   - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f"   - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f"   Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f"   - Conceptual/Case Study\n"
        f"   - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f"   - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f"   Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    output = pipe(prompt, max_new_tokens=512)

    # Extract structured text from model output
    summary_text = output[0]["generated_text"]

    # The text-generation pipeline echoes the prompt at the start of
    # generated_text by default. The prompt itself contains the empty
    # "- Context:\n- Research Question..." template, so without stripping it
    # re.search would match those empty template lines first and every
    # section would come back blank.
    if summary_text.startswith(prompt):
        summary_text = summary_text[len(prompt):]

    # Split the text into structured sections
    sections = {
        "Context": "",
        "Research Question and Findings": "",
        "Theme of Research": "",
        "Method": "",
        "Contribution": "",
        "Future Potential and Limitations": "",
    }

    # Match each "- <Section>:" header and capture the text that follows, up
    # to the next "- <Capitalized...>" header or end of string. re.escape
    # keeps any future section names with regex metacharacters safe.
    for section in sections:
        match = re.search(
            rf"- {re.escape(section)}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL
        )
        if match:
            sections[section] = match.group(1).strip()

    # Return the extracted sections
    return sections
71
# Process each PDF and summarize
def process_all_papers(pdf_directory, reference, num_papers=32):
    """Summarize papers ``1.pdf`` .. ``{num_papers}.pdf`` found in *pdf_directory*.

    Parameters
    ----------
    pdf_directory : str
        Directory containing PDFs named by their numeric index.
    reference : pandas.DataFrame
        Must contain an 'Index' column and a 'Citation' column used to look
        up each paper's citation.
    num_papers : int, optional
        Highest paper index to look for (default 32, the original
        hard-coded count — kept for backward compatibility).

    Returns
    -------
    list[dict]
        One dict per existing PDF with the ID, citation and the structured
        summary fields.
    """
    paper_summaries = []

    for paper_index in range(1, num_papers + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")

        # Missing files are skipped silently so gaps in the numbering are OK.
        if not os.path.exists(pdf_file_path):
            continue

        # Extract text from the PDF using pdfplumber
        text = extract_text_from_pdf(pdf_file_path)

        # Get structured summary of the paper
        summary = structure_summary_output(text)

        # Look up the citation in the reference dataframe; guard against a
        # missing row instead of letting `.values[0]` raise IndexError.
        citation_values = reference.loc[
            reference['Index'] == paper_index, 'Citation'
        ].values
        citation = citation_values[0] if len(citation_values) else ""

        # Store the information in the dictionary
        paper_summaries.append({
            "ID": paper_index,
            "Citation": citation,
            "Context": summary["Context"],
            "Research Question and Findings": summary["Research Question and Findings"],
            "Theme of Research": summary["Theme of Research"],
            "Method": summary["Method"],
            "Contribution": summary["Contribution"],
            "Future Potential and Limitations": summary["Future Potential and Limitations"],
        })

    return paper_summaries
106
def interpret_search_criteria(user_input):
    """Determine search criteria based on user input text.

    Returns a dict ``{"Theme": ..., "Method": ...}`` where each value is one
    of the canonical labels used in the paper summaries, or "" when the
    input does not mention that dimension.
    """
    # Normalize once instead of calling .lower() in every test.
    text = user_input.lower()

    theme = ""
    # Accept both "human vs ai" and the punctuated "human vs. ai" spelling —
    # the latter is the canonical label this function itself returns, so it
    # must also be recognized as input.
    if "human vs ai" in text or "human vs. ai" in text:
        theme = "Human vs. AI"
    elif "human + ai" in text or "collaboration" in text:
        theme = "Human + AI Collaboration"

    method = ""
    if "empirical" in text:
        method = "Empirical Study"
    elif "conceptual" in text or "case study" in text:
        method = "Conceptual/Case Study"
    elif "modeling" in text:
        method = "Modeling"

    return {"Theme": theme, "Method": method}
128
def search_and_summarize_with_llm(paper_summaries, user_input):
    """Retrieve articles matching the user's criteria and summarize them.

    *paper_summaries* is the list of dicts produced by process_all_papers;
    *user_input* is free text interpreted by interpret_search_criteria.
    Returns a (cohesive_summary, formatted_citations) pair of strings.
    """
    global pipe
    # Interpret the search criteria from the user input
    search_criteria = interpret_search_criteria(user_input)

    # Hoist the lowercased criteria out of the loop. Note an empty criterion
    # ("") is a substring of everything, so it acts as "no filter" on that
    # dimension — this matches the original matching behavior.
    theme_needle = search_criteria['Theme'].lower()
    method_needle = search_criteria['Method'].lower()

    # Collect all relevant summaries and citations
    relevant_summaries = []
    citation_list = []

    for summary in paper_summaries:
        # Check if this summary matches all search criteria
        if (theme_needle in summary["Theme of Research"].lower()
                and method_needle in summary["Method"].lower()):
            # Append the full information of this paper summary
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])

    # Nothing matched: skip the LLM call rather than prompting it with an
    # empty body, which would invite a hallucinated summary.
    if not relevant_summaries:
        return "No papers matched the given search criteria.", "Citations:\n"

    # Generate a cohesive summary using the LLM
    combined_text = "\n".join(relevant_summaries)

    # Note the trailing space after the comma: without it the prompt read
    # "...Empirical Study,provide a cohesive summary...".
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    summary_output = pipe(prompt, max_new_tokens=512)
    cohesive_summary = summary_output[0]["generated_text"]

    # Format the citation list
    formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)

    return cohesive_summary, formatted_citations