Spaces:
Runtime error
Runtime error
import os
import re

import pdfplumber
import pandas as pd
from transformers import pipeline

# Hugging Face access token is read from the environment so credentials
# never live in source control.  May be None if the variable is unset.
token = os.getenv("HUGGINGFACE_TOKEN")

# Initialize the Llama-2 chat pipeline once at import time.
# `device_map="auto"` lets accelerate place the weights on whatever
# hardware is available.  NOTE: `use_auth_token` is deprecated in recent
# transformers releases; `token=` is the supported keyword.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-chat-hf",
    token=token,
    device_map="auto",
)
# Function to extract text from PDF using pdfplumber
def extract_text_from_pdf(pdf_file_path):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_file_path: Filesystem path to the PDF to read.

    Returns:
        A single string containing the text of all pages, in order.
        Pages with no extractable text contribute nothing.
    """
    pages_text = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages without a text layer
            # (e.g. scanned images); the original `text += page.extract_text()`
            # raised TypeError in that case.  Skip empty pages instead.
            page_text = page.extract_text()
            if page_text:
                pages_text.append(page_text)
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(pages_text)
# Define the function to structure the model output into required fields
def structure_summary_output(text):
    """Ask the LLM to summarize a paper and parse the reply into fields.

    Args:
        text: Full plain text of the academic paper.

    Returns:
        Dict mapping each of the six section names ("Context",
        "Research Question and Findings", "Theme of Research", "Method",
        "Contribution", "Future Potential and Limitations") to the text
        the model produced for it; empty string when a section is missing.
    """
    global pipe
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f"   - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f"   - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f"   Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f"   - Conceptual/Case Study\n"
        f"   - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f"   - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f"   Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    output = pipe(prompt, max_new_tokens=512)
    generated = output[0]["generated_text"]
    # text-generation pipelines echo the prompt at the start of
    # `generated_text`, and the prompt itself ends with the empty
    # "- Context:\n- Research Question..." template.  Parsing the full
    # string would therefore match the blank template instead of the
    # model's answer, so strip the prompt prefix first.
    if generated.startswith(prompt):
        summary_text = generated[len(prompt):]
    else:
        summary_text = generated
    # Split the text into structured sections
    sections = {
        "Context": "",
        "Research Question and Findings": "",
        "Theme of Research": "",
        "Method": "",
        "Contribution": "",
        "Future Potential and Limitations": "",
    }
    # Each section runs from its "- <Name>:" header up to the next
    # "- <Capitalized...>" header or the end of the reply.
    for section in sections:
        match = re.search(rf"- {section}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL)
        if match:
            sections[section] = match.group(1).strip()
    # Return the extracted sections
    return sections
# Process each PDF and summarize
def process_all_papers(pdf_directory, reference, num_papers=32):
    """Summarize every numbered PDF in a directory.

    Args:
        pdf_directory: Directory containing files named "1.pdf", "2.pdf", ...
        reference: DataFrame with at least an 'Index' and a 'Citation'
            column, mapping each paper number to its citation string.
        num_papers: Highest paper number to look for (default 32, the
            original hard-coded upper bound).

    Returns:
        List of dicts, one per PDF found on disk, each holding the ID,
        citation, and the six structured summary fields.
    """
    paper_summaries = []
    for paper_index in range(1, num_papers + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")
        # Missing files are skipped silently so gaps in the numbering
        # do not abort the whole batch.
        if not os.path.exists(pdf_file_path):
            continue
        # Extract text from the PDF using pdfplumber
        text = extract_text_from_pdf(pdf_file_path)
        # Get structured summary of the paper
        summary = structure_summary_output(text)
        # Look up the citation in the reference dataframe.  Guard the
        # empty-match case: the original `.values[0]` raised IndexError
        # when the index had no matching row.
        citation_values = reference.loc[reference['Index'] == paper_index, 'Citation'].values
        citation = citation_values[0] if len(citation_values) > 0 else ""
        # Store the information in the dictionary
        paper_summaries.append({
            "ID": paper_index,
            "Citation": citation,
            "Context": summary["Context"],
            "Research Question and Findings": summary["Research Question and Findings"],
            "Theme of Research": summary["Theme of Research"],
            "Method": summary["Method"],
            "Contribution": summary["Contribution"],
            "Future Potential and Limitations": summary["Future Potential and Limitations"],
        })
    return paper_summaries
def interpret_search_criteria(user_input):
    """Determine search criteria (theme and method) from free-text input.

    Args:
        user_input: Free-form search text typed by the user.

    Returns:
        Dict with keys "Theme" and "Method".  Either value is "" when
        the input does not mention a recognized theme/method keyword.
    """
    # Lowercase once instead of on every membership test.
    lowered = user_input.lower()
    theme = ""
    method = ""
    if "human vs ai" in lowered:
        theme = "Human vs. AI"
    elif "human + ai" in lowered or "collaboration" in lowered:
        theme = "Human + AI Collaboration"
    if "empirical" in lowered:
        method = "Empirical Study"
    elif "conceptual" in lowered or "case study" in lowered:
        method = "Conceptual/Case Study"
    elif "modeling" in lowered:
        method = "Modeling"
    return {"Theme": theme, "Method": method}
def search_and_summarize_with_llm(paper_summaries, user_input):
    """Retrieve relevant papers for a user query and summarize them.

    Args:
        paper_summaries: List of summary dicts as produced by
            process_all_papers().
        user_input: Free-text search query; interpreted by
            interpret_search_criteria().

    Returns:
        Tuple of (cohesive_summary, formatted_citations) strings.
    """
    global pipe
    # Interpret the search criteria from the user input
    search_criteria = interpret_search_criteria(user_input)
    # Collect all relevant summaries and citations
    relevant_summaries = []
    citation_list = []
    for summary in paper_summaries:
        # A paper matches when both interpreted criteria appear in its
        # fields; an empty criterion ("" is a substring of everything)
        # matches every paper.
        theme_match = search_criteria['Theme'].lower() in summary["Theme of Research"].lower()
        method_match = search_criteria['Method'].lower() in summary["Method"].lower()
        if theme_match and method_match:
            # Append the full information of this paper summary
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])
    # No matches: don't send an empty context to the LLM.
    if not relevant_summaries:
        return "No papers matched the requested criteria.", "Citations:\n"
    # Generate a cohesive summary using the LLM.  The original prompt was
    # missing a space after the comma between the two f-string pieces
    # ("...Empirical Study,provide a cohesive...").
    combined_text = "\n".join(relevant_summaries)
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    summary_output = pipe(prompt, max_new_tokens=512)
    generated = summary_output[0]["generated_text"]
    # text-generation pipelines echo the prompt; return only the
    # newly generated continuation.
    if generated.startswith(prompt):
        cohesive_summary = generated[len(prompt):]
    else:
        cohesive_summary = generated
    # Format the citation list
    formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)
    return cohesive_summary, formatted_citations