Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import openai | |
| import os | |
| import json | |
| from io import StringIO | |
| from PyPDF2 import PdfReader | |
| from docx import Document | |
| import html2text | |
# Optional: Prevent config issues on HF Spaces by pointing Streamlit's
# config directory at a location under /tmp.
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"

# Page-level configuration; issued before any other Streamlit command below.
st.set_page_config(page_title="Document Parser", layout="wide")

# The chat history is kept in session state; create it when it is absent.
if "conversation" not in st.session_state:
    st.session_state["conversation"] = []

# Sidebar settings: the API key and sampling temperature used by the
# "Ask" handler further down the script.
st.sidebar.title("Settings")
api_key = st.sidebar.text_input(label="OpenAI API Key", type="password")
temperature = st.sidebar.slider(
    label="Temperature",
    min_value=0.0,
    max_value=1.0,
    value=0.3,
    step=0.1,
)

# Main UI
st.title("Document Parser")
st.markdown("Upload documents and ask questions using GPT.")

# File uploader: several documents may be supplied at once.
uploaded_files = st.file_uploader(
    label="Upload Documents (PDF, DOCX, TXT, HTML)",
    type=["pdf", "docx", "txt", "html"],
    accept_multiple_files=True,
)
def _decode_upload(raw):
    """Decode uploaded bytes as UTF-8 text without raising on bad input."""
    # "utf-8-sig" drops a leading byte-order mark (common in files written by
    # Windows editors) and otherwise behaves like plain UTF-8. Undecodable
    # bytes become U+FFFD instead of aborting the whole request.
    return raw.decode("utf-8-sig", errors="replace")


def extract_text(file) -> str:
    """Return the plain text contained in an uploaded document.

    The format is chosen from the part of ``file.name`` after the last dot,
    compared case-insensitively:

    * ``pdf``  - text of every page that yields any, joined by newlines.
    * ``docx`` - text of every paragraph, joined by newlines.
    * ``txt``  - the file's bytes decoded as UTF-8.
    * ``html`` - the decoded markup converted with ``html2text``.

    Args:
        file: A binary file-like object with a ``name`` attribute and a
            ``read()`` method (for example a Streamlit uploaded file).

    Returns:
        The extracted text, or ``""`` for an unrecognised extension.
    """
    # Rewind first: the same upload object may already have been consumed by
    # an earlier call, in which case reading again would yield nothing.
    if hasattr(file, "seek"):
        file.seek(0)

    ext = file.name.lower().rpartition(".")[2]

    if ext == "pdf":
        reader = PdfReader(file)
        # Extract each page exactly once; extraction is the expensive step.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    if ext == "docx":
        doc = Document(file)
        return "\n".join(para.text for para in doc.paragraphs)
    if ext == "txt":
        return _decode_upload(file.read())
    if ext == "html":
        return html2text.html2text(_decode_upload(file.read()))
    return ""
# Longest document prefix, in characters, that is forwarded to the model.
MAX_DOCUMENT_CHARS = 6000


def _ask_openai(api_key, document_text, question, temperature):
    """Ask gpt-3.5-turbo a question about a document and return its answer.

    Works with both the openai>=1.0 client interface and the legacy
    module-level ``openai.ChatCompletion`` interface, whichever is installed.

    Args:
        api_key: OpenAI API key used for this request.
        document_text: Combined text of the uploaded documents; only the
            first ``MAX_DOCUMENT_CHARS`` characters are sent.
        question: The user's question.
        temperature: Sampling temperature passed to the API.

    Returns:
        The assistant's reply as a string.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions based on uploaded documents."},
        {"role": "user", "content": f"DOCUMENT:\n{document_text[:MAX_DOCUMENT_CHARS]}\n\nQUESTION:\n{question}"},
    ]

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: ChatCompletion.create was removed; use a client object.
        # A per-request client also avoids storing the key in module state.
        client = openai.OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=temperature,
        )
        return response.choices[0].message.content or ""

    # Legacy openai<1.0 interface.
    openai.api_key = api_key
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=temperature,
    )
    return response["choices"][0]["message"]["content"]


# Input field
question = st.text_input("Ask a question about the uploaded documents:")

# When "Ask" button is clicked
if st.button("Ask"):
    # Tell the user what is missing instead of silently ignoring the click.
    requirements = (
        ("at least one uploaded document", uploaded_files),
        ("a question", question),
        ("an OpenAI API key", api_key),
    )
    missing = [label for label, value in requirements if not value]

    if missing:
        st.warning("Please provide " + ", ".join(missing) + " before asking.")
    else:
        with st.spinner("Processing..."):
            # Extract text file by file so that one unreadable upload does
            # not abort the whole request.
            texts = []
            for uploaded in uploaded_files:
                try:
                    texts.append(extract_text(uploaded))
                except Exception as e:
                    st.warning(f"Could not read {uploaded.name}: {e}")
            combined_text = "".join(text + "\n" for text in texts)

            if not combined_text.strip():
                st.warning("Could not extract text from uploaded files.")
            else:
                try:
                    answer = _ask_openai(api_key, combined_text, question, temperature)
                except Exception as e:
                    st.error(f"Error from OpenAI: {e}")
                else:
                    # Update conversation history
                    st.session_state.conversation.append({"role": "user", "content": question})
                    st.session_state.conversation.append({"role": "assistant", "content": answer})
# Display conversation, plus controls to clear it or download it.
if st.session_state.conversation:
    st.markdown("## Conversation")
    for msg in st.session_state.conversation:
        speaker = "You" if msg["role"] == "user" else "Assistant"
        st.markdown(f"**{speaker}:** {msg['content']}")
    st.markdown("---")

    col1, col2 = st.columns(2)

    with col1:
        if st.button("Clear Conversation"):
            st.session_state.conversation = []
            # Newer Streamlit releases provide st.rerun and drop
            # st.experimental_rerun; prefer the former, fall back to the latter.
            rerun = getattr(st, "rerun", None) or st.experimental_rerun
            rerun()

    with col2:
        # Named download_format so the builtin format() is not shadowed.
        download_format = st.selectbox("Download Format", ["TXT", "JSON"])
        if download_format == "TXT":
            content = "\n\n".join(
                f"{msg['role'].capitalize()}:\n{msg['content']}"
                for msg in st.session_state.conversation
            )
            mime = "text/plain"
            filename = "conversation.txt"
        else:
            # ensure_ascii=False keeps non-ASCII text readable in the export
            # instead of emitting \uXXXX escapes.
            content = json.dumps(
                st.session_state.conversation, indent=2, ensure_ascii=False
            )
            mime = "application/json"
            filename = "conversation.json"
        st.download_button("Download", content, file_name=filename, mime=mime)