File size: 3,942 Bytes
f372a5b
 
 
 
 
 
ab5a702
 
c69a0b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f372a5b
 
 
 
 
 
 
1224539
 
ef3a9ce
f372a5b
 
 
 
 
 
 
1224539
 
 
 
 
 
 
f372a5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import tempfile

import pandas as pd
import streamlit as st
from huggify_data.scrape_modules import PDFQnAGenerator

def _render_instructions():
    """Render the collapsible "Instruction" panel in the sidebar."""
    with st.sidebar.expander("Instruction"):
        st.write('''
        🎉 Introducing Huggify-Data: Your Ultimate PDF Data Scraping and Uploading Tool! 🎉
        
        👋 I'm thrilled to present the new user-friendly interface for my Python package, huggify-data. This powerful tool simplifies the process of scraping data from PDFs and generating question and answer pairs using OpenAI, making it perfect for building conversational chatbots. 🤖✨
        
        🚀 Key Features:
        1. Easy PDF Data Extraction: Quickly scrape text content from PDFs and convert it into a structured data frame.
        2. Automated Question-Answer Pair Generation: Extract meaningful question-answer pairs from your PDF content, ideal for training chatbots.
        3. User-Friendly Interface: Interact with the package without any programming experience, making information accessibility easier and more efficient.
        
        🔧 How It Works:
        - API Key: Add your OpenAI API Key.
        - Load Your PDF: Easily load any PDF file into the library.
        - Just wait: Wait and download the `.csv` from the app.
        
        📈 Why Huggify-Data?
        Whether you're a data scientist, developer, or AI enthusiast, Huggify-Data streamlines the process of preparing your PDF data for AI applications. It's never been easier to transform your PDFs into valuable datasets for building conversational AI models.
        
        🔗 Links:
        - **GitHub Repository**: [https://lnkd.in/eJEJebcw](https://lnkd.in/eJEJebcw)
        - **Documentation**: [https://lnkd.in/eF9JFXAP](https://lnkd.in/eF9JFXAP)
        - **Notebook**: [https://lnkd.in/eaA2qaPt](https://lnkd.in/eaA2qaPt)
        - **App**: [https://huggingface.co/spaces/eagle0504/huggify-data](https://huggingface.co/spaces/eagle0504/huggify-data)
        
        Don't forget to like, comment, and subscribe for more updates and tutorials on AI and data science! 👍🔔
        
        #HuggifyData #PythonLibrary #AI #DataScience #HuggingFace #PDFScraping #Chatbot #OpenSource #Yiqiao
        ''')


def _generate_qa_dataframe(pdf_bytes, openai_api_key):
    """Run the question-answer pipeline on a PDF and return its result table.

    ``PDFQnAGenerator`` is constructed from a file path, so the uploaded bytes
    are first written to a temporary ``.pdf`` file. That file is removed again
    once the pipeline finishes, whether it succeeded or raised.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.
        openai_api_key: API key passed through to ``PDFQnAGenerator``.

    Returns:
        Whatever ``PDFQnAGenerator.convert_to_dataframe()`` returns; the caller
        uses it as a pandas DataFrame.

    Raises:
        Any exception raised by ``PDFQnAGenerator`` propagates unchanged.
    """
    # delete=False: the file has to outlive the `with` block below so the
    # generator can open it by name; it is removed explicitly in `finally`.
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with temp_pdf:
            temp_pdf.write(pdf_bytes)

        generator = PDFQnAGenerator(temp_pdf.name, openai_api_key)
        generator.process_scraped_content()
        generator.generate_questions_answers()
        return generator.convert_to_dataframe()
    finally:
        # Best-effort cleanup: a failure to delete the temp file must not mask
        # the pipeline's own result or exception.
        try:
            os.remove(temp_pdf.name)
        except OSError:
            pass


def main():
    """Build the Streamlit page: sidebar inputs, processing, and CSV download.

    The sidebar collects a PDF upload and an OpenAI API key. Once both are
    present, the PDF is turned into question-answer pairs, which are shown as
    a table and offered as a CSV download. Nothing else is rendered until both
    inputs are provided.
    """
    st.set_page_config(layout="wide")
    st.title("PDF Question-Answer Generator using Huggify-Data Package")

    # Expander in the sidebar for instruction
    _render_instructions()

    # Sidebar for uploading the PDF file
    st.sidebar.title("Upload PDF")
    uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")

    # Text input for OpenAI API key
    openai_api_key = st.sidebar.text_input("Enter your OpenAI API key", type="password")

    # Embed YouTube video in the sidebar
    st.sidebar.video("https://youtu.be/CfMcw4OTLCQ")

    # Both inputs are required before any processing is attempted.
    if uploaded_file is None or not openai_api_key:
        return

    # Show a spinner while processing the PDF
    with st.spinner('Processing the PDF and generating questions and answers...'):
        # getvalue() returns the full upload regardless of the stream's current
        # read position, unlike read(), which yields nothing once consumed.
        df = _generate_qa_dataframe(uploaded_file.getvalue(), openai_api_key)

    # Display the resulting DataFrame
    st.subheader("Generated Question-Answer Pairs")
    st.write(df)

    # Option to download the DataFrame as a CSV
    csv = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download as CSV",
        data=csv,
        file_name='questions_answers.csv',
        mime='text/csv',
    )

# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()