# GPTtool / utils.py
# NOTE(review): the lines below are Hugging Face Hub page residue
# ("Jessie0201's picture / Update utils.py / 463d8cd verified") that was
# pasted into the file; kept as a comment so the module stays importable.
import os
import re

import pandas as pd
import pdfplumber
from transformers import pipeline

# Hugging Face access token, required because the Llama-2 checkpoint is gated.
token = os.getenv("HUGGINGFACE_TOKEN")

# Shared text-generation pipeline used by all summarization helpers below.
# `token=` replaces the deprecated `use_auth_token=` argument.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    token=token,
    device_map="auto",
)
# Function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_file_path):
    """Extract the plain text of every page of a PDF.

    Parameters
    ----------
    pdf_file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    str
        Concatenated text of all pages. Pages without a text layer
        (e.g. scanned images) are skipped instead of raising.
    """
    page_texts = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages with no extractable text;
            # the original `text += page.extract_text()` raised TypeError there.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    # join() avoids the quadratic cost of repeated string +=.
    return "".join(page_texts)
# Define the function to structure the model output into required fields
def structure_summary_output(text):
    """Summarize a paper with the LLM and parse the reply into fixed fields.

    Parameters
    ----------
    text : str
        Full text of the paper (as returned by ``extract_text_from_pdf``).

    Returns
    -------
    dict
        Maps each of the six section names to the extracted text, or ""
        when the model's reply did not contain that section.
    """
    global pipe
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f"   - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f"   - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f"   Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f"   - Conceptual/Case Study\n"
        f"   - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f"   - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f"   Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    # return_full_text=False strips the echoed prompt from the output.
    # Without it, generated_text starts with the prompt itself, and the
    # section regex below matched the *empty template* inside the prompt
    # ("- Context:\n- Research Question...") instead of the model's answer.
    output = pipe(prompt, max_new_tokens=512, return_full_text=False)
    summary_text = output[0]["generated_text"]

    # Target fields; "" stays when a section is missing from the reply.
    sections = {
        "Context": "",
        "Research Question and Findings": "",
        "Theme of Research": "",
        "Method": "",
        "Contribution": "",
        "Future Potential and Limitations": "",
    }
    for section in sections:
        # Capture everything after "- <Section>:" up to the next "- X" header
        # (capital letter) or end of text.
        match = re.search(
            rf"- {re.escape(section)}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL
        )
        if match:
            sections[section] = match.group(1).strip()
    return sections
# Process each PDF and summarize
def process_all_papers(pdf_directory, reference, max_index=32):
    """Summarize every numbered PDF (``1.pdf`` .. ``<max_index>.pdf``) in a directory.

    Parameters
    ----------
    pdf_directory : str
        Directory containing PDFs named ``<index>.pdf``.
    reference : pandas.DataFrame
        Must have 'Index' and 'Citation' columns mapping paper index to citation.
    max_index : int, optional
        Highest paper index to look for (default 32, matching the original
        hard-coded ``range(1, 33)``).

    Returns
    -------
    list[dict]
        One dict per PDF found, with ID, Citation, and the six summary fields.
    """
    paper_summaries = []
    for paper_index in range(1, max_index + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")
        if not os.path.exists(pdf_file_path):
            continue  # numbering may have gaps; skip missing files
        text = extract_text_from_pdf(pdf_file_path)
        summary = structure_summary_output(text)
        # Look up the citation; fall back to "" when the index is absent from
        # the reference table instead of raising IndexError on .values[0].
        citation_values = reference.loc[
            reference['Index'] == paper_index, 'Citation'
        ].values
        citation = citation_values[0] if len(citation_values) else ""
        paper_summaries.append({
            "ID": paper_index,
            "Citation": citation,
            "Context": summary["Context"],
            "Research Question and Findings": summary["Research Question and Findings"],
            "Theme of Research": summary["Theme of Research"],
            "Method": summary["Method"],
            "Contribution": summary["Contribution"],
            "Future Potential and Limitations": summary["Future Potential and Limitations"],
        })
    return paper_summaries
def interpret_search_criteria(user_input):
    """Derive ``{'Theme': ..., 'Method': ...}`` filters from free-text input.

    Keyword matching is case-insensitive. Rules are tried in order and the
    first hit wins; an empty string means "no filter" for that dimension.
    """
    lowered = user_input.lower()

    theme_rules = (
        ("Human vs. AI", ("human vs ai",)),
        ("Human + AI Collaboration", ("human + ai", "collaboration")),
    )
    method_rules = (
        ("Empirical Study", ("empirical",)),
        ("Conceptual/Case Study", ("conceptual", "case study")),
        ("Modeling", ("modeling",)),
    )

    def first_match(rules):
        # Return the label of the first rule whose keywords appear in the input.
        for label, keywords in rules:
            if any(keyword in lowered for keyword in keywords):
                return label
        return ""

    return {"Theme": first_match(theme_rules), "Method": first_match(method_rules)}
def search_and_summarize_with_llm(paper_summaries, user_input):
    """Filter paper summaries by the user's criteria and synthesize them.

    Parameters
    ----------
    paper_summaries : list[dict]
        Summaries as produced by ``process_all_papers``.
    user_input : str
        Free-text query interpreted by ``interpret_search_criteria``.

    Returns
    -------
    tuple[str, str]
        (cohesive LLM-generated summary, formatted citation list).

    Notes
    -----
    An empty Theme or Method criterion acts as a wildcard: ``"" in s`` is
    True for every string, so unfiltered dimensions match all papers.
    """
    global pipe
    search_criteria = interpret_search_criteria(user_input)
    # Hoist the lowercased filters out of the loop.
    theme_filter = search_criteria['Theme'].lower()
    method_filter = search_criteria['Method'].lower()

    relevant_summaries = []
    citation_list = []
    for summary in paper_summaries:
        # Substring match so e.g. "Modeling" also matches richer labels.
        if (theme_filter in summary["Theme of Research"].lower()
                and method_filter in summary["Method"].lower()):
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])

    # Generate a cohesive summary over all matching papers.
    combined_text = "\n".join(relevant_summaries)
    # The trailing ", " fixes the original run-together prompt text
    # ("...Empirical Study,provide a cohesive summary...").
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    summary_output = pipe(prompt, max_new_tokens=512)
    cohesive_summary = summary_output[0]["generated_text"]

    formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)
    return cohesive_summary, formatted_citations