"""Streamlit app: upload documents and ask questions about them via OpenAI chat.

Text is extracted from the uploaded PDF, DOCX, TXT and HTML files, truncated
to a fixed character budget, and sent together with the user's question to
the chat-completion endpoint. Question/answer pairs are kept in the Streamlit
session state so they can be displayed, cleared, or downloaded.
"""

import streamlit as st
import openai
import os
import json
from io import StringIO
from PyPDF2 import PdfReader
from docx import Document
import html2text

# Optional: Prevent config issues on HF Spaces
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"

# Chat model used for answering questions.
MODEL_NAME = "gpt-3.5-turbo"
# Only this many leading characters of the combined document text are sent
# to the model, to keep the prompt within a bounded size.
MAX_DOCUMENT_CHARS = 6000
SYSTEM_PROMPT = (
    "You are a helpful assistant that answers questions based on uploaded "
    "documents."
)

# Configure Streamlit page
st.set_page_config(page_title="Document Parser", layout="wide")

# Session state to hold chat history
if "conversation" not in st.session_state:
    st.session_state.conversation = []

# Sidebar settings
with st.sidebar:
    st.title("Settings")
    api_key = st.text_input("OpenAI API Key", type="password")
    temperature = st.slider("Temperature", 0.0, 1.0, 0.3, 0.1)

# Main UI
st.title("Document Parser")
st.markdown("Upload documents and ask questions using GPT.")

# File uploader
uploaded_files = st.file_uploader(
    "Upload Documents (PDF, DOCX, TXT, HTML)",
    type=["pdf", "docx", "txt", "html"],
    accept_multiple_files=True,
)


def extract_text(file) -> str:
    """Return the plain text contained in an uploaded file.

    The format is chosen from the (case-insensitive) extension of
    ``file.name``. Supported extensions are ``pdf``, ``docx``, ``txt`` and
    ``html``; any other extension yields an empty string.

    Args:
        file: A seekable binary file-like object with a ``name`` attribute.

    Returns:
        The extracted text, or ``""`` for an unsupported extension.
    """
    ext = file.name.lower().split(".")[-1]
    # Rewind first: the stream may already have been consumed earlier.
    file.seek(0)
    if ext == "pdf":
        reader = PdfReader(file)
        # Extract each page exactly once; skip pages that yield no text.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    if ext == "docx":
        doc = Document(file)
        return "\n".join(para.text for para in doc.paragraphs)
    if ext == "txt":
        # errors="replace" keeps non-UTF-8 uploads from crashing the app.
        return file.read().decode("utf-8", errors="replace")
    if ext == "html":
        return html2text.html2text(file.read().decode("utf-8", errors="replace"))
    return ""


def _ask_openai(key, document_text, user_question, sampling_temperature):
    """Send the document excerpt and question to the chat model.

    Works with both the openai>=1.0 client interface and the legacy
    ``openai.ChatCompletion`` interface, selected by feature detection.

    Returns:
        The assistant's reply text.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                f"DOCUMENT:\n{document_text[:MAX_DOCUMENT_CHARS]}"
                f"\n\nQUESTION:\n{user_question}"
            ),
        },
    ]
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: ChatCompletion was removed in favour of a client object.
        client = openai.OpenAI(api_key=key)
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=sampling_temperature,
        )
        return response.choices[0].message.content

    # Legacy openai<1.0 interface.
    openai.api_key = key
    response = openai.ChatCompletion.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=sampling_temperature,
    )
    return response["choices"][0]["message"]["content"]


# Input field
question = st.text_input("Ask a question about the uploaded documents:")

# When "Ask" button is clicked
if st.button("Ask"):
    # Tell the user what is missing instead of silently doing nothing.
    if not api_key:
        st.warning("Please enter your OpenAI API key in the sidebar.")
    elif not uploaded_files:
        st.warning("Please upload at least one document.")
    elif not question:
        st.warning("Please enter a question.")
    else:
        with st.spinner("Processing..."):
            # Extract and combine text from all uploaded files
            extracted_parts = []
            for file in uploaded_files:
                try:
                    extracted_parts.append(extract_text(file))
                except Exception as e:
                    # UI boundary: one unreadable file should not abort the
                    # whole request, so report it and continue.
                    st.warning(f"Could not read {file.name}: {e}")
            combined_text = "".join(f"{part}\n" for part in extracted_parts)

            if not combined_text.strip():
                st.warning("Could not extract text from uploaded files.")
            else:
                try:
                    answer = _ask_openai(
                        api_key, combined_text, question, temperature
                    )
                except Exception as e:
                    st.error(f"Error from OpenAI: {e}")
                else:
                    # Update conversation history
                    st.session_state.conversation.append(
                        {"role": "user", "content": question}
                    )
                    st.session_state.conversation.append(
                        {"role": "assistant", "content": answer}
                    )

# Display conversation
if st.session_state.conversation:
    st.markdown("## Conversation")
    for msg in st.session_state.conversation:
        st.markdown(
            f"**{'You' if msg['role'] == 'user' else 'Assistant'}:** {msg['content']}"
        )

    st.markdown("---")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("Clear Conversation"):
            st.session_state.conversation = []
            # st.experimental_rerun was deprecated in favour of st.rerun;
            # fall back to it only on older Streamlit releases.
            if hasattr(st, "rerun"):
                st.rerun()
            else:
                st.experimental_rerun()

    with col2:
        download_format = st.selectbox("Download Format", ["TXT", "JSON"])
        if download_format == "TXT":
            content = "\n\n".join(
                f"{msg['role'].capitalize()}:\n{msg['content']}"
                for msg in st.session_state.conversation
            )
            mime = "text/plain"
            filename = "conversation.txt"
        else:
            content = json.dumps(st.session_state.conversation, indent=2)
            mime = "application/json"
            filename = "conversation.json"

        st.download_button("Download", content, file_name=filename, mime=mime)