import os

import openpyxl
import streamlit as st
from PyPDF2 import PdfReader
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# SECURITY: an API key is committed to source here. Treat it as compromised:
# revoke/rotate it and provide the replacement through the deployment
# environment (or a secrets store) instead of this literal.
# setdefault() keeps the literal only as a fallback, so a key already supplied
# by the environment is no longer silently overwritten at import time.
# NOTE(review): nothing in the visible code reads GOOGLE_API_KEY -- confirm it
# is still needed, and delete this line once the key comes from the environment.
os.environ.setdefault('GOOGLE_API_KEY', 'AIzaSyD8uzXToT4I2ABs7qo_XiuKh8-L2nuWCEM')
def get_pdf_text(pdf_docs):
    """Return the text of every page of every PDF, concatenated in order.

    Args:
        pdf_docs: Iterable of PDF sources, each passed to ``PdfReader``
            (``main`` passes the files returned by ``st.file_uploader``).
            ``None`` or an empty iterable yields an empty string.

    Returns:
        The page texts joined in document/page order with no separator.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Defensive: if a page yields None instead of a string (e.g. a
            # page with no extractable text), "+= None" used to raise
            # TypeError; treat such pages as empty instead.
            page_texts.append(page.extract_text() or "")
    # One join instead of repeated "+=" keeps large documents linear-time.
    return "".join(page_texts)
def get_excel_text(excel_docs):
    """Return the cell values of every sheet of every workbook as one string.

    Workbooks, sheets, rows and cells are visited in iteration order; each
    non-empty cell value is converted with ``str`` and the pieces are joined
    with single spaces.

    Args:
        excel_docs: Iterable of workbook sources, each passed to
            ``openpyxl.load_workbook`` (``main`` passes the files returned by
            ``st.file_uploader``). ``None`` or an empty iterable yields an
            empty string.

    Returns:
        The space-separated cell texts with surrounding whitespace stripped.
    """
    cell_texts = []
    for excel_doc in excel_docs or ():
        workbook = openpyxl.load_workbook(filename=excel_doc)
        for sheet in workbook:
            for row in sheet:
                for cell in row:
                    # Empty cells have value None; str(None) used to inject
                    # the literal word "None" into the text. Falsy-but-real
                    # values such as 0 are kept.
                    if cell.value is not None:
                        cell_texts.append(str(cell.value))
    return " ".join(cell_texts).strip()
def get_user_input(user_question, qa_pipeline):
    """Answer ``user_question`` from the processed document text and show it.

    The context is read from ``st.session_state`` under the key ``raw_text``.

    Args:
        user_question: Question text entered by the user.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)``; its result is indexed
            with ``["answer"]``.

    Returns:
        None. The answer (or a warning when there is no document text) is
        written to the Streamlit page.
    """
    with st.container():
        context = st.session_state.get("raw_text") or ""
        if not context.strip():
            # Processing with no files (or files without extractable text)
            # leaves an empty context. The Hugging Face question-answering
            # pipeline rejects empty context, so warn instead of crashing.
            st.warning(
                "No document text is available. "
                "Please upload and process a document first."
            )
            return
        response = qa_pipeline(question=user_question, context=context)
        st.write("Answer:", response["answer"])
def main():
    """Render the DocChat page: document upload sidebar plus a Q&A section.

    The sidebar lets the user upload PDF or Excel files and process them into
    plain text, which is stored in ``st.session_state.raw_text``. The main
    area takes a question and answers it from that text with a Hugging Face
    question-answering pipeline.
    """
    st.set_page_config("DocChat")
    st.header("DocChat - Chat with multiple documents")
    st.write("---")

    with st.container():
        with st.sidebar:
            st.title("Settings")
            st.subheader("Upload Documents")

            # NOTE(review): each "Process" button replaces raw_text, so
            # processing Excel files discards previously processed PDF text
            # (and vice versa) -- confirm this is intended.
            st.markdown("**PDF files:**")
            pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True)
            if st.button("Process PDF file"):
                if not pdf_docs:
                    # Previously this stored an empty text and reported success.
                    st.warning("Please upload at least one PDF file first.")
                else:
                    with st.spinner("Processing PDFs..."):
                        st.session_state.raw_text = get_pdf_text(pdf_docs)
                        st.success("PDF processed successfully!")

            st.markdown("**Excel files:**")
            excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True)
            if st.button("Process Excel file"):
                if not excel_docs:
                    st.warning("Please upload at least one Excel file first.")
                else:
                    with st.spinner("Processing Excel files..."):
                        st.session_state.raw_text = get_excel_text(excel_docs)
                        st.success("Excel file processed successfully!")

    with st.container():
        st.subheader("Document Q&A")
        st.write('Ask a question : ')
        user_question = st.text_input("Ask a Question from the document")
        if not user_question:
            return

        if not st.session_state.get("raw_text"):
            # Asking before any text was processed used to do nothing at all.
            st.info(
                "No document text is available yet. "
                "Upload and process a document before asking a question."
            )
            return

        # Streamlit re-runs this script on every interaction. Keeping the
        # pipeline in session state avoids reloading the tokenizer and model
        # for every single question.
        qa_pipeline = st.session_state.get("qa_pipeline")
        if qa_pipeline is None:
            # NOTE(review): the "question-answering" pipeline needs a
            # checkpoint with a QA head -- confirm this model id provides one.
            model_name = "HanNayeoniee/LHK_DPO_v1"
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForQuestionAnswering.from_pretrained(model_name)
            qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
            st.session_state.qa_pipeline = qa_pipeline

        get_user_input(user_question, qa_pipeline)
# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()