Jessie0201 committed on
Commit
27b16dd
·
verified ·
1 Parent(s): e469069

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +82 -0
  2. utils.py +160 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
import pandas as pd
from dotenv import load_dotenv

# Import only the utils names app.py actually uses. The original
# `from utils import *` worked, but app.py then relied on `pd` leaking
# through the star import instead of importing pandas itself.
from utils import process_all_papers, search_and_summarize_with_llm

# Load environment variables (e.g. HUGGINGFACE_TOKEN used by utils) from .env.
load_dotenv()

# Input locations: the citation spreadsheet and the directory of paper PDFs.
reference_file_path = "./data/reference.xlsx"
pdf_directory = "./data/pdf/"

# Fail fast with a clear message if the reference spreadsheet is missing.
if not os.path.exists(reference_file_path):
    raise FileNotFoundError("Reference file not found. Please ensure 'data/reference.xlsx' exists in the workspace.")
# Citation reference table; expected to have 'Index' and 'Citation' columns
# (see process_all_papers).
reference = pd.read_excel(reference_file_path)

# Summarize every paper up front so both Gradio handlers can reuse the results.
paper_summaries = process_all_papers(pdf_directory, reference=reference)
19
+
20
def display_summaries(summaries=None):
    """Format paper summaries for display in the Gradio output textbox.

    Args:
        summaries: Optional list of summary dicts (as produced by
            process_all_papers). Defaults to the module-level
            ``paper_summaries`` so the existing zero-argument Gradio
            callback keeps working unchanged.

    Returns:
        A single string with one labelled section per paper, each
        followed by a dashed divider.
    """
    if summaries is None:
        summaries = paper_summaries
    # Build the pieces in a list and join once; the original `+=` in a
    # loop is quadratic in the total output size.
    parts = [
        (
            f"Paper ID: {summary['ID']}\n"
            f"Citation: {summary['Citation']}\n"
            f"Context: {summary['Context']}\n"
            f"Research Question and Findings: {summary['Research Question and Findings']}\n"
            f"Theme of Research: {summary['Theme of Research']}\n"
            f"Method: {summary['Method']}\n"
            f"Contribution: {summary['Contribution']}\n"
            f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            "------------------------------------------\n\n"
        )
        for summary in summaries
    ]
    return "".join(parts)
39
+
40
+
41
def retrieve_and_display_search_results(user_input):
    """Search the precomputed paper summaries and return displayable text.

    Args:
        user_input: Free-text search criteria typed by the user.

    Returns:
        The LLM-generated cohesive summary followed by the formatted
        citation list, separated by a blank line.
    """
    # Delegate matching and summarization to the utils helper, then stitch
    # the two result pieces into a single string for the output textbox.
    summary_text, citations_text = search_and_summarize_with_llm(paper_summaries, user_input)
    return f"{summary_text}\n\n{citations_text}"
49
+
50
+
51
# ---- Gradio user interface ----
with gr.Blocks() as demo:
    # Section 1: batch summarization of all processed papers.
    gr.Markdown("# Academic Paper Summarization Tool")
    gr.Markdown("Click 'Begin Summarization' to process and summarize the 32 papers.")

    summary_box = gr.Textbox(
        label="Summarization Output",
        placeholder="Summaries will appear here after processing...",
        lines=30,
        interactive=False,
    )

    summarize_btn = gr.Button("Begin Summarization")
    summarize_btn.click(fn=display_summaries, inputs=None, outputs=summary_box)

    # Section 2: criteria-driven search over the same summaries.
    gr.Markdown("# Research Summarization Tool")
    gr.Markdown("Type your search criteria below (e.g., 'I want all research about human VS AI and empirical research')")

    criteria_box = gr.Textbox(label="Search Criteria", placeholder="Enter your search criteria here...")
    search_btn = gr.Button("Search Relevant Articles")
    results_box = gr.Textbox(
        label="Search Results",
        placeholder="Results of search will appear here...",
        lines=30,
        interactive=False,
    )

    search_btn.click(fn=retrieve_and_display_search_results, inputs=criteria_box, outputs=results_box)

# Launch the Gradio app.
demo.launch()
utils.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pdfplumber
3
+ import pandas as pd
4
+ import re
5
+ from transformers import pipeline
6
+ import pandas as pd
7
+
8
+
9
def extract_text_from_pdf(pdf_file_path):
    """Extract the plain text of every page of a PDF with pdfplumber.

    Args:
        pdf_file_path: Path to the PDF file to read.

    Returns:
        The concatenated text of all pages. Pages for which pdfplumber's
        extract_text() returns None (e.g. image-only/scanned pages) are
        skipped — the original ``text += page.extract_text()`` raised
        TypeError on such pages.
    """
    with pdfplumber.open(pdf_file_path) as pdf:
        # Join once instead of repeated `+=` (quadratic for large PDFs).
        return "".join(page.extract_text() or "" for page in pdf.pages)
16
+
17
def structure_summary_output(text):
    """Summarize a paper's text into six labelled sections via an LLM.

    Args:
        text: Full text of the paper (as produced by extract_text_from_pdf).

    Returns:
        Dict with keys "Context", "Research Question and Findings",
        "Theme of Research", "Method", "Contribution" and
        "Future Potential and Limitations". A value stays "" when the
        corresponding section cannot be located in the model output.
    """
    # NOTE(review): the pipeline is rebuilt (and the model reloaded) on every
    # call — consider caching it at module level if this is a bottleneck.
    pipe = pipeline(
        "text-generation",
        model="meta-llama/Llama-2-70b-hf",
        token=os.getenv("HUGGINGFACE_TOKEN"),  # `use_auth_token` is deprecated in transformers
    )
    prompt = (
        f"Please summarize the following information from the academic paper:\n"
        f"1. Context: Specify whether the study is focused on a specific industry, task or a broader, conceptual scope.\n"
        f"2. Research Question and Findings: Identify the main research question and summarise the key findings.\n"
        f"3. Theme of Research:\n"
        f"   - Human vs. AI: Highlight any comparative advantages between humans and AI.\n"
        f"   - Human + AI Collaboration: Indicate the type of collaboration discussed.\n"
        f"   Note that the output for this field can only be either 'Human vs. AI' or 'Human + AI Collaboration'\n"
        f"4. Method: Classify the study method as one of the following:\n"
        f"   - Conceptual/Case Study\n"
        f"   - Modeling: Either Stylized Modeling or Operations Research (OR) Model\n"
        f"   - Empirical Study: Lab/Field Experiment or Secondary Data Analysis\n"
        f"   Note that the output for this field can only be either 'Conceptual/Case Study' or 'Modeling' or 'Empirical Study'\n"
        f"5. Contribution: Identify the primary contribution (theoretical, managerial, or methodological).\n"
        f"6. Future Potential and Limitations: Summarize future research directions or limitations.\n\n"
        f"Paper content:\n{text}\n\n"
        f"Respond with the answers formatted in the following structure:\n"
        f"- Context:\n- Research Question and Findings:\n- Theme of Research:\n- Method:\n"
        f"- Contribution:\n- Future Potential and Limitations:\n"
    )
    # return_full_text=False keeps the prompt out of generated_text. Without
    # it, text-generation pipelines echo the prompt, and the section regex
    # below would match the empty "- Context:\n- Research Question..."
    # skeleton inside the prompt instead of the model's actual answers.
    output = pipe(prompt, max_new_tokens=512, return_full_text=False)

    summary_text = output[0]["generated_text"]

    # Target sections, all defaulting to "" if not found in the output.
    sections = {
        "Context": "",
        "Research Question and Findings": "",
        "Theme of Research": "",
        "Method": "",
        "Contribution": "",
        "Future Potential and Limitations": "",
    }

    # For each "- <Header>:" capture everything up to the next "- <Capital>"
    # header (or end of text).
    for section in sections:
        match = re.search(rf"- {section}:(.*?)(?=- [A-Z]|$)", summary_text, re.DOTALL)
        if match:
            sections[section] = match.group(1).strip()

    return sections
63
+
64
def process_all_papers(pdf_directory, reference, num_papers=32):
    """Summarize every numbered paper PDF found in a directory.

    Expects PDFs named "1.pdf" ... f"{num_papers}.pdf". Papers whose file
    is missing are silently skipped (original behavior preserved).

    Args:
        pdf_directory: Directory containing the numbered PDF files.
        reference: DataFrame with 'Index' and 'Citation' columns mapping
            paper numbers to citations.
        num_papers: Highest paper number to look for. Defaults to 32,
            the previously hard-coded value, so existing callers are
            unaffected.

    Returns:
        List of summary dicts (ID, Citation plus the six sections from
        structure_summary_output), one per successfully processed paper.

    Raises:
        IndexError: If a processed paper has no matching 'Index' row in
            *reference* (unchanged from the original lookup).
    """
    paper_summaries = []

    for paper_index in range(1, num_papers + 1):
        pdf_file_path = os.path.join(pdf_directory, f"{paper_index}.pdf")
        if not os.path.exists(pdf_file_path):
            # Missing papers are skipped rather than treated as an error.
            continue

        # Extract the raw text, then ask the LLM for the structured summary.
        text = extract_text_from_pdf(pdf_file_path)
        summary = structure_summary_output(text)

        # Look up the citation by paper number in the reference table.
        # NOTE(review): raises IndexError when the index is absent.
        citation = reference.loc[reference['Index'] == paper_index, 'Citation'].values[0]

        # structure_summary_output returns exactly the six section keys,
        # so merging it yields the same record shape as before.
        paper_summaries.append({
            "ID": paper_index,
            "Citation": citation,
            **summary,
        })

    return paper_summaries
96
+
97
+
98
+
99
def interpret_search_criteria(user_input):
    """Map free-text search input onto canonical theme/method labels.

    Args:
        user_input: Free-text criteria typed by the user.

    Returns:
        Dict {"Theme": theme, "Method": method}; either value is "" when
        no keyword matched (an empty criterion then matches every paper
        in search_and_summarize_with_llm).
    """
    # Lower-case once (the original recomputed .lower() per test) and drop
    # the period in "vs." so that the canonical label "Human vs. AI" itself
    # is recognized — the original substring test "human vs ai" failed on
    # the dotted form.
    normalized = user_input.lower().replace("vs.", "vs")

    theme = ""
    if "human vs ai" in normalized:
        theme = "Human vs. AI"
    elif "human + ai" in normalized or "collaboration" in normalized:
        theme = "Human + AI Collaboration"

    method = ""
    if "empirical" in normalized:
        method = "Empirical Study"
    elif "conceptual" in normalized or "case study" in normalized:
        method = "Conceptual/Case Study"
    elif "modeling" in normalized:
        method = "Modeling"

    return {"Theme": theme, "Method": method}
119
+
120
+
121
def search_and_summarize_with_llm(paper_summaries, user_input):
    """Filter summaries by interpreted criteria and produce a cohesive LLM summary.

    Args:
        paper_summaries: List of summary dicts from process_all_papers.
        user_input: Free-text search criteria.

    Returns:
        Tuple (cohesive_summary, formatted_citations): LLM-generated prose
        over the matching papers, and a bulleted "Citations:" list.
    """
    # Interpret the search criteria from the user input.
    search_criteria = interpret_search_criteria(user_input)
    # Hoist the lowercased filters out of the loop.
    theme_filter = search_criteria['Theme'].lower()
    method_filter = search_criteria['Method'].lower()

    relevant_summaries = []
    citation_list = []

    for summary in paper_summaries:
        # An empty criterion ("") is a substring of everything, so an
        # unspecified theme/method matches every paper.
        if theme_filter in summary["Theme of Research"].lower() and method_filter in summary["Method"].lower():
            relevant_summaries.append(
                f"Paper ID: {summary['ID']}\nCitation: {summary['Citation']}\nContext: {summary['Context']}\n"
                f"Research Question and Findings: {summary['Research Question and Findings']}\nTheme of Research: {summary['Theme of Research']}\n"
                f"Method: {summary['Method']}\nContribution: {summary['Contribution']}\n"
                f"Future Potential and Limitations: {summary['Future Potential and Limitations']}\n\n"
            )
            citation_list.append(summary["Citation"])

    combined_text = "\n".join(relevant_summaries)
    # Fixed: the original had a duplicated assignment (`pipe = pipe = ...`)
    # and used the deprecated `use_auth_token` argument.
    pipe = pipeline("text-generation", model="meta-llama/Llama-2-70b-hf", token=os.getenv("HUGGINGFACE_TOKEN"))
    # Fixed: the two f-string pieces were concatenated without a space,
    # yielding "...{Method},provide a cohesive...".
    prompt = (
        f"Based on the following research summaries related to {search_criteria['Theme']} and {search_criteria['Method']}, "
        f"provide a cohesive summary discussing connections, common themes, trends, and future directions:\n\n{combined_text}"
    )
    # return_full_text=False so the echoed prompt is not part of the summary.
    summary_output = pipe(prompt, max_new_tokens=512, return_full_text=False)
    cohesive_summary = summary_output[0]["generated_text"]

    # Format the citation list as a bulleted block.
    formatted_citations = "Citations:\n" + "\n".join(f"- {citation}" for citation in citation_list)

    return cohesive_summary, formatted_citations