File size: 3,187 Bytes
bdac891
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import streamlit as st
# import langchain
import PyPDF2
import os
from transformers import BartTokenizer , BartForConditionalGeneration

# Streamlit re-executes this script from the top on every widget interaction.
# Loading the checkpoint through a cached factory keeps the tokenizer and the
# model in memory across reruns instead of re-instantiating them each time.
@st.cache_resource
def _load_summarizer(checkpoint: str = "facebook/bart-large-cnn"):
    """Return the ``(tokenizer, model)`` pair for the given BART checkpoint."""
    return (
        BartTokenizer.from_pretrained(checkpoint),
        BartForConditionalGeneration.from_pretrained(checkpoint),
    )


tokenizer, model = _load_summarizer()


def save_uploaded_file(uploaded_file):
    """Write an uploaded file into the local ``temp_files`` directory.

    Args:
        uploaded_file: Object exposing a ``name`` attribute and a
            ``getbuffer()`` method that returns the file's bytes.

    Returns:
        The path of the written file, ``temp_files/<base name>``.
    """
    temp_dir = "temp_files"
    os.makedirs(temp_dir, exist_ok=True)

    # The name comes from the client, so keep only its final path component.
    # Otherwise a name such as "../../x" would be written outside temp_dir.
    # Backslashes are normalised so Windows-style paths are stripped as well.
    safe_name = os.path.basename(str(uploaded_file.name).replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        # Nothing usable is left after stripping; fall back to a fixed name.
        safe_name = "uploaded_file"

    file_path = os.path.join(temp_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of an uploaded PDF.

    The upload is first written to disk with ``save_uploaded_file`` and then
    parsed from that copy with ``PyPDF2.PdfReader``.

    Args:
        pdf_file: Uploaded file object accepted by ``save_uploaded_file``.

    Returns:
        The non-empty page texts joined with newlines, in page order, or
        ``""`` when no page yields any text.
    """
    saved_path = save_uploaded_file(pdf_file)
    with open(saved_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open, as the original loop did.
        # ``or ""`` tolerates a page whose extraction yields nothing.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; pages without text are skipped.
    return "\n".join(chunk for chunk in page_texts if chunk)

def generate_summary(text: str):
    """Run the module-level BART model on *text* and return its raw output.

    The input is tokenized with truncation at 1024 tokens, and generation
    uses 4 beams with early stopping and a maximum output length of 200.

    Args:
        text: The document text to summarize.

    Returns:
        Whatever ``model.generate`` returns — presumably a batch of token-id
        sequences; decoding is left to the caller.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    )
    return model.generate(
        encoded["input_ids"],
        num_beams=4,
        max_length=200,
        early_stopping=True,
    )




# Function to summarize text
def summarize_text(text: str) -> str:
    """Return a summary of *text* as a plain string.

    Args:
        text: The document text to summarize.

    Returns:
        The decoded first sequence produced by ``generate_summary`` with
        special tokens removed, or ``""`` when *text* is empty or only
        whitespace.
    """
    # There is nothing to summarize in blank input, so skip the model call.
    if not text or not text.strip():
        return ""

    summary_ids = generate_summary(text)
    # The keyword was previously misspelled ("clean_ip_..."), so the intended
    # option never took effect.
    return tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

# Function to extract key information from the paper
def extract_paper_info(text):
    """Placeholder for extracting key information from a paper.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    The extraction logic (e.g. regex or NLP techniques) is meant to be added
    here according to the specific requirements.
    """
    return None


# Function to build and fine-tune the chatbot
def build_chatbot():
    """Placeholder for building and fine-tuning the chatbot.

    Not implemented yet: no language model is created and an empty string is
    returned in its place. Fine-tuning steps (e.g. with Langchain) are meant
    to be added here.
    """
    return ""


# Main function to run the Streamlit app
def main():
    """Render the Streamlit page: upload a PDF and display its summary.

    Without an upload, the user is prompted to provide one. With an upload,
    the PDF text is extracted and summarized. If no text can be extracted,
    a warning is shown instead of a summary.
    """
    st.title("Research Paper Understanding Chatbot")
    st.write("As of now supports only summarization.")

    uploaded_file = st.file_uploader("Upload a research paper (PDF)", type="pdf")
    if uploaded_file is None:
        st.write("Please upload a PDF file")
        return

    st.write("Paper uploaded successfully!")
    text = extract_text_from_pdf(uploaded_file)

    st.subheader("Summary of the Paper")
    # A PDF with no extractable text (for example a scanned document) would
    # otherwise be passed to the model as empty input.
    if not text or not text.strip():
        st.warning("No extractable text was found in this PDF, so it cannot be summarized.")
        return

    with st.spinner("Brewing a potion for your paper's essence..."):
        summary = summarize_text(text)
        st.write(summary)

    # Planned features, kept disabled until the helpers are implemented:
    #
    # # Extract key information from the paper
    # st.subheader("Key Information")
    # paper_info = extract_paper_info(text)
    # st.write(paper_info)

    # # Build chatbot
    # st.subheader("Chatbot")
    # chatbot = build_chatbot()

    # # Chat interface
    # user_input = st.text_input("You: ")
    # if user_input:
    #     response = chatbot.generate_response(user_input)
    #     st.write("Chatbot:", response)


# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()