# GPTtool / utils.py
# NOTE(review): the lines below are Hugging Face Hub page residue
# ("Jessie0201's picture / Update utils.py / 463d8cd verified") that was
# pasted into the file; kept as a comment so the module stays importable.
import os
import re

import pandas as pd
import pdfplumber
from transformers import pipeline

# Hugging Face access token, required because the Llama-2 checkpoint is gated.
token = os.getenv("HUGGINGFACE_TOKEN")

# Shared text-generation pipeline used by all summarization helpers below.
# `token=` replaces the deprecated `use_auth_token=` argument.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    token=token,
    device_map="auto",
)
# Function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_file_path):
    """Extract the plain text of every page of a PDF.

    Parameters
    ----------
    pdf_file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    str
        Concatenated text of all pages. Pages without a text layer
        (e.g. scanned images) are skipped instead of raising.
    """
    page_texts = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages with no extractable text;
            # the original `text += page.extract_text()` raised TypeError there.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    # join() avoids the quadratic cost of repeated string +=.
    return "".join(page_texts)
# Define the function to structure the model output into required fields
def structure_summary_output(text):
    """Summarize a paper with the LLM and parse the reply into fixed fields.

    Parameters
    ----------
    text : str
        Full text of the paper (as returned by ``extract_text_from_pdf``).

    Returns
    -------
    dict
        Maps each of the six section names to the extracted text, or ""
        when the model's reply did not contain that section.
    """
    global pipe
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f"   - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f"   - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f"   Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f"   - Conceptual/Case Study\n"
        f"   - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f"   - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f"   Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    # return_full_text=False strips the echoed prompt from the output.
    # Without it, generated_text starts with the prompt itself, and the
    # section regex below matched the *empty template* inside the prompt
    # ("- Context:\n- Research Question...") instead of the model's answer.
    output = pipe(prompt, max_new_tokens=512, return_full_text=False)
    summary_text = output[0]["generated_text"]

    # Target fields; "" stays when a section is missing from the reply.
    sections = {
        "Context": "",
        "Research Question and Findings": "",
        "Theme of Research": "",
        "Method": "",
        "Contribution": "",
        "Future Potential and Limitations": "",
    }
    for section in sections:
        # Capture everything after "- <Section>:" up to the next "- X" header
        # (capital letter) or end of text.
        match = re.search(
            rf"- {re.escape(section)}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL
        )
        if match:
            sections[section] = match.group(1).strip()
    return sections
# Process each PDF and summarize
def process_all_papers(pdf_directory, reference, max_index=32):
    """Summarize every numbered PDF (``1.pdf`` .. ``<max_index>.pdf``) in a directory.

    Parameters
    ----------
    pdf_directory : str
        Directory containing PDFs named ``<index>.pdf``.
    reference : pandas.DataFrame
        Must have 'Index' and 'Citation' columns mapping paper index to citation.
    max_index : int, optional
        Highest paper index to look for (default 32, matching the original
        hard-coded ``range(1, 33)``).

    Returns
    -------
    list[dict]
        One dict per PDF found, with ID, Citation, and the six summary fields.
    """
    paper_summaries = []
    for paper_index in range(1, max_index + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")
        if not os.path.exists(pdf_file_path):
            continue  # numbering may have gaps; skip missing files
        text = extract_text_from_pdf(pdf_file_path)
        summary = structure_summary_output(text)
        # Look up the citation; fall back to "" when the index is absent from
        # the reference table instead of raising IndexError on .values[0].
        citation_values = reference.loc[
            reference['Index'] == paper_index, 'Citation'
        ].values
        citation = citation_values[0] if len(citation_values) else ""
        paper_summaries.append({
            "ID": paper_index,
            "Citation": citation,
            "Context": summary["Context"],
            "Research Question and Findings": summary["Research Question and Findings"],
            "Theme of Research": summary["Theme of Research"],
            "Method": summary["Method"],
            "Contribution": summary["Contribution"],
            "Future Potential and Limitations": summary["Future Potential and Limitations"],
        })
    return paper_summaries
def interpret_search_criteria(user_input):
    """Derive ``{'Theme': ..., 'Method': ...}`` filters from free-text input.

    Keyword matching is case-insensitive. Rules are tried in order and the
    first hit wins; an empty string means "no filter" for that dimension.
    """
    lowered = user_input.lower()

    theme_rules = (
        ("Human vs. AI", ("human vs ai",)),
        ("Human + AI Collaboration", ("human + ai", "collaboration")),
    )
    method_rules = (
        ("Empirical Study", ("empirical",)),
        ("Conceptual/Case Study", ("conceptual", "case study")),
        ("Modeling", ("modeling",)),
    )

    def first_match(rules):
        # Return the label of the first rule whose keywords appear in the input.
        for label, keywords in rules:
            if any(keyword in lowered for keyword in keywords):
                return label
        return ""

    return {"Theme": first_match(theme_rules), "Method": first_match(method_rules)}
def search_and_summarize_with_llm(paper_summaries, user_input):
    """Filter paper summaries by the user's criteria and synthesize them.

    Parameters
    ----------
    paper_summaries : list[dict]
        Summaries as produced by ``process_all_papers``.
    user_input : str
        Free-text query interpreted by ``interpret_search_criteria``.

    Returns
    -------
    tuple[str, str]
        (cohesive LLM-generated summary, formatted citation list).

    Notes
    -----
    An empty Theme or Method criterion acts as a wildcard: ``"" in s`` is
    True for every string, so unfiltered dimensions match all papers.
    """
    global pipe
    search_criteria = interpret_search_criteria(user_input)
    # Hoist the lowercased filters out of the loop.
    theme_filter = search_criteria['Theme'].lower()
    method_filter = search_criteria['Method'].lower()

    relevant_summaries = []
    citation_list = []
    for summary in paper_summaries:
        # Substring match so e.g. "Modeling" also matches richer labels.
        if (theme_filter in summary["Theme of Research"].lower()
                and method_filter in summary["Method"].lower()):
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])

    # Generate a cohesive summary over all matching papers.
    combined_text = "\n".join(relevant_summaries)
    # The trailing ", " fixes the original run-together prompt text
    # ("...Empirical Study,provide a cohesive summary...").
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    summary_output = pipe(prompt, max_new_tokens=512)
    cohesive_summary = summary_output[0]["generated_text"]

    formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)
    return cohesive_summary, formatted_citations