# NOTE: The lines below are artifacts of the web page this file was scraped
# from (Hugging Face Spaces build status, file size, commit hash, and the
# editor's line-number gutter); they are not part of the program.
# Spaces: Runtime error
# File size: 7,256 Bytes | commit 463d8cd
import os
import re

import pdfplumber
import pandas as pd
from transformers import pipeline

# Load the Hugging Face token from the environment (required for gated
# models such as meta-llama/Llama-2-7b-chat-hf).
token = os.getenv("HUGGINGFACE_TOKEN")

# Initialize the text-generation pipeline once at import time so all the
# functions below share the loaded model.
# NOTE(review): `use_auth_token` is deprecated in recent transformers
# releases in favour of `token=` — confirm against the pinned version.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    use_auth_token=token,
    device_map="auto",
)
# Function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_file_path):
    """Extract the full text of a PDF with pdfplumber.

    Args:
        pdf_file_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of every page. Pages without an extractable
        text layer (e.g. scanned images) contribute nothing.
    """
    with pdfplumber.open(pdf_file_path) as pdf:
        # extract_text() returns None for pages with no text layer; the
        # original `text += page.extract_text()` would raise TypeError
        # there. `or ""` guards that, and join avoids quadratic +=.
        return "".join(page.extract_text() or "" for page in pdf.pages)
# Define the function to structure the model output into required fields
def structure_summary_output(text):
    """Ask the LLM to summarize a paper and parse the reply into fields.

    Args:
        text: Full text of the paper (as extracted from its PDF).

    Returns:
        Dict mapping each of the six section names ("Context",
        "Research Question and Findings", "Theme of Research", "Method",
        "Contribution", "Future Potential and Limitations") to the text
        the model produced for that section; empty string when a section
        could not be located in the model's reply.
    """
    global pipe
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f"   - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f"   - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f"   Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f"   - Conceptual/Case Study\n"
        f"   - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f"   - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f"   Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    output = pipe(prompt, max_new_tokens=512)
    summary_text = output[0]["generated_text"]

    # The text-generation pipeline returns the prompt followed by the
    # completion. Strip the prompt so the regexes below parse the model's
    # answers instead of matching the empty "- Context:\n- ..." template
    # embedded in the prompt itself (which would yield empty sections).
    if summary_text.startswith(prompt):
        summary_text = summary_text[len(prompt):]

    sections = {
        "Context": "",
        "Research Question and Findings": "",
        "Theme of Research": "",
        "Method": "",
        "Contribution": "",
        "Future Potential and Limitations": "",
    }
    for section in sections:
        # Capture everything after "- <Section>:" up to the next
        # "- <Capitalized...>" header or the end of the reply.
        match = re.search(
            rf"- {re.escape(section)}:(.*?)(?=- [A-Z]|$)",
            summary_text,
            re.DOTALL,
        )
        if match:
            sections[section] = match.group(1).strip()
    return sections
# Process each PDF and summarize
def process_all_papers(pdf_directory, reference, num_papers=32):
    """Summarize every numbered PDF in a directory.

    Args:
        pdf_directory: Directory containing PDFs named "1.pdf" .. "<n>.pdf".
        reference: pandas DataFrame with at least the columns 'Index'
            (matching the PDF file numbers) and 'Citation'.
        num_papers: Highest paper index to look for (default 32, matching
            the original hard-coded range).

    Returns:
        List of dicts, one per PDF that exists, each with keys "ID",
        "Citation", and the six summary section names.
    """
    paper_summaries = []
    for paper_index in range(1, num_papers + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")
        # Missing files are skipped silently; the numbering may have gaps.
        if not os.path.exists(pdf_file_path):
            continue
        text = extract_text_from_pdf(pdf_file_path)
        summary = structure_summary_output(text)
        # Look up the citation for this index; fall back to an empty
        # string instead of raising IndexError when the index is absent
        # from the reference frame.
        matches = reference.loc[reference['Index'] == paper_index, 'Citation'].values
        citation = matches[0] if len(matches) else ""
        paper_summaries.append({
            "ID": paper_index,
            "Citation": citation,
            "Context": summary["Context"],
            "Research Question and Findings": summary["Research Question and Findings"],
            "Theme of Research": summary["Theme of Research"],
            "Method": summary["Method"],
            "Contribution": summary["Contribution"],
            "Future Potential and Limitations": summary["Future Potential and Limitations"],
        })
    return paper_summaries
def interpret_search_criteria(user_input):
    """Map free-form user text to Theme / Method filter values.

    Args:
        user_input: Free-form search text typed by the user.

    Returns:
        Dict with keys "Theme" and "Method". Either value may be the
        empty string when the input does not mention it (downstream,
        an empty filter matches everything).
    """
    # Lowercase once instead of on every membership test.
    lowered = user_input.lower()

    theme = ""
    # Accept both "human vs ai" and the punctuated "human vs. ai", which
    # matches the canonical label this function itself returns.
    if "human vs ai" in lowered or "human vs. ai" in lowered:
        theme = "Human vs. AI"
    elif "human + ai" in lowered or "collaboration" in lowered:
        theme = "Human + AI Collaboration"

    method = ""
    if "empirical" in lowered:
        method = "Empirical Study"
    elif "conceptual" in lowered or "case study" in lowered:
        method = "Conceptual/Case Study"
    elif "modeling" in lowered:
        method = "Modeling"

    return {"Theme": theme, "Method": method}
def search_and_summarize_with_llm(paper_summaries, user_input):
    """Retrieve papers matching the user's criteria and synthesize them.

    Args:
        paper_summaries: List of per-paper dicts as produced by
            process_all_papers().
        user_input: Free-form search text used to derive Theme/Method
            filters via interpret_search_criteria().

    Returns:
        Tuple of (cohesive_summary, formatted_citations): the LLM's
        synthesis of the matching papers, and a "Citations:" block
        listing each matching paper's citation.
    """
    global pipe
    search_criteria = interpret_search_criteria(user_input)

    # Hoist the lowered criteria out of the loop. Note that an empty
    # criterion acts as a wildcard: "" is a substring of every string,
    # so it matches all papers.
    theme_filter = search_criteria['Theme'].lower()
    method_filter = search_criteria['Method'].lower()

    relevant_summaries = []
    citation_list = []
    for summary in paper_summaries:
        if (theme_filter in summary["Theme of Research"].lower()
                and method_filter in summary["Method"].lower()):
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])

    # Generate a cohesive summary of all matching papers with the LLM.
    # (Fixed: the two f-string fragments previously joined without a
    # space, producing "...,provide a cohesive summary...".)
    combined_text = "\n".join(relevant_summaries)
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    summary_output = pipe(prompt, max_new_tokens=512)
    cohesive_summary = summary_output[0]["generated_text"]

    formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)
    return cohesive_summary, formatted_citations