"""Summarize academic PDFs with a local Llama-2 chat pipeline and search the results.

Workflow: extract text from numbered PDFs (``1.pdf`` .. ``N.pdf``), ask the LLM
for a structured six-field summary per paper, then filter/summarize the corpus
according to free-text user search criteria.
"""

import os
import re

import pandas as pd
import pdfplumber
from transformers import pipeline

# Load the Hugging Face token from the environment.
token = os.getenv("HUGGINGFACE_TOKEN")

# Initialize the text-generation pipeline once at import time.
# NOTE(review): `use_auth_token` is deprecated in recent transformers releases
# in favour of `token=`; kept as-is for compatibility with the installed version.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    use_auth_token=token,
    device_map="auto",
)

# The six summary fields the LLM is asked to produce, in prompt order.
_SUMMARY_SECTIONS = (
    "Context",
    "Research Question and Findings",
    "Theme of Research",
    "Method",
    "Contribution",
    "Future Potential and Limitations",
)


def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated text of every page of the PDF at *pdf_file_path*.

    pdfplumber's ``extract_text()`` returns ``None`` for pages with no
    extractable text (e.g. scanned images); those pages contribute an empty
    string instead of raising ``TypeError``.
    """
    parts = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            parts.append(page.extract_text() or "")  # None -> "" for image-only pages
    return "".join(parts)


def structure_summary_output(text):
    """Ask the LLM to summarize *text* and parse the reply into named sections.

    Parameters
    ----------
    text : str
        Full text of one paper.
        NOTE(review): no truncation is applied; very long papers may exceed the
        model's context window — confirm acceptable for the corpus.

    Returns
    -------
    dict
        Keys are the six names in ``_SUMMARY_SECTIONS``; a section the model
        omitted maps to an empty string.
    """
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f" - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f" - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f" Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f" - Conceptual/Case Study\n"
        f" - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f" - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f" Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    # return_full_text=False keeps the echoed prompt out of the output.  This
    # matters: the prompt itself contains the empty "- Context:" template, so
    # parsing the full text would match the template instead of the answer.
    output = pipe(prompt, max_new_tokens=512, return_full_text=False)
    summary_text = output[0]["generated_text"]

    sections = {name: "" for name in _SUMMARY_SECTIONS}
    for section in sections:
        # Capture everything after "- <Section>:" up to the next "- <Capital>"
        # header, or end of text.
        match = re.search(
            rf"- {re.escape(section)}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL
        )
        if match:
            sections[section] = match.group(1).strip()
    return sections


def process_all_papers(pdf_directory, reference, num_papers=32):
    """Extract and summarize PDFs named ``1.pdf`` .. ``<num_papers>.pdf``.

    Parameters
    ----------
    pdf_directory : str
        Directory containing the numbered PDF files; missing files are skipped.
    reference : pandas.DataFrame
        Must have 'Index' and 'Citation' columns mapping paper number to its
        citation string.
    num_papers : int, optional
        Highest paper index to process (default 32, matching the original
        hard-coded range).

    Returns
    -------
    list of dict
        One dict per found PDF with 'ID', 'Citation', and the six summary
        fields produced by :func:`structure_summary_output`.
    """
    paper_summaries = []
    for paper_index in range(1, num_papers + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")
        if not os.path.exists(pdf_file_path):
            continue
        text = extract_text_from_pdf(pdf_file_path)
        summary = structure_summary_output(text)
        # Guarded lookup: a single missing index in the reference table yields
        # an empty citation instead of aborting the whole run with IndexError.
        matches = reference.loc[reference["Index"] == paper_index, "Citation"].values
        citation = matches[0] if len(matches) else ""
        paper_summary = {"ID": paper_index, "Citation": citation}
        paper_summary.update(summary)  # adds the six section fields
        paper_summaries.append(paper_summary)
    return paper_summaries


def interpret_search_criteria(user_input):
    """Determine search criteria (theme and method) from free-text user input.

    Returns a dict ``{"Theme": ..., "Method": ...}``; either value is an empty
    string when no matching keyword is found (empty acts as a wildcard in
    :func:`search_and_summarize_with_llm`).
    """
    lowered = user_input.lower()  # lowercase once, not per keyword test

    theme = ""
    if "human vs ai" in lowered:
        theme = "Human vs. AI"
    elif "human + ai" in lowered or "collaboration" in lowered:
        theme = "Human + AI Collaboration"

    method = ""
    if "empirical" in lowered:
        method = "Empirical Study"
    elif "conceptual" in lowered or "case study" in lowered:
        method = "Conceptual/Case Study"
    elif "modeling" in lowered:
        method = "Modeling"

    return {"Theme": theme, "Method": method}


def search_and_summarize_with_llm(paper_summaries, user_input):
    """Retrieve papers matching the interpreted criteria and summarize them.

    Parameters
    ----------
    paper_summaries : list of dict
        Output of :func:`process_all_papers`.
    user_input : str
        Free-text query, interpreted by :func:`interpret_search_criteria`.

    Returns
    -------
    tuple of (str, str)
        The LLM-generated cohesive summary and a formatted citation list.
    """
    search_criteria = interpret_search_criteria(user_input)

    relevant_summaries = []
    citation_list = []
    for summary in paper_summaries:
        # Substring match; an empty criterion ("") matches everything, so a
        # query with no recognized keywords selects all papers.
        theme_ok = search_criteria["Theme"].lower() in summary["Theme of Research"].lower()
        method_ok = search_criteria["Method"].lower() in summary["Method"].lower()
        if theme_ok and method_ok:
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])

    # Generate a cohesive summary across all matched papers.
    combined_text = "\n".join(relevant_summaries)
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    # return_full_text=False so the returned summary is only the generated
    # answer, not the (potentially huge) echoed prompt.
    summary_output = pipe(prompt, max_new_tokens=512, return_full_text=False)
    cohesive_summary = summary_output[0]["generated_text"]

    formatted_citations = "Citations:\n" + "\n".join(
        f"- {citation}" for citation in citation_list
    )
    return cohesive_summary, formatted_citations