Spaces:
Sleeping
Sleeping
File size: 3,912 Bytes
447c4d7 c6c3565 447c4d7 c6c3565 447c4d7 c6c3565 447c4d7 c6c3565 447c4d7 c6c3565 447c4d7 714614e 447c4d7 c6c3565 714614e c6c3565 447c4d7 c6c3565 447c4d7 714614e c6c3565 447c4d7 c6c3565 714614e c6c3565 714614e c6c3565 447c4d7 c6c3565 714614e c6c3565 714614e c6c3565 447c4d7 c6c3565 714614e 447c4d7 c6c3565 447c4d7 714614e 447c4d7 c6c3565 447c4d7 714614e 447c4d7 714614e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import streamlit as st
import openai
import os
import json
from io import StringIO
from PyPDF2 import PdfReader
from docx import Document
import html2text
# Optional: point Streamlit at a config directory under /tmp to prevent
# config issues on HF Spaces.
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"

# Page-level configuration.
st.set_page_config(page_title="Document Parser", layout="wide")

# The chat history is kept in session state; create it on first use.
if "conversation" not in st.session_state:
    st.session_state.conversation = []

# Sidebar settings: the API key and sampling temperature used for the request.
st.sidebar.title("Settings")
api_key = st.sidebar.text_input("OpenAI API Key", type="password")
temperature = st.sidebar.slider(
    "Temperature",
    min_value=0.0,
    max_value=1.0,
    value=0.3,
    step=0.1,
)

# Main page heading and short usage hint.
st.title("Document Parser")
st.markdown("Upload documents and ask questions using GPT.")

# Document upload widget; several files may be selected at once.
uploaded_files = st.file_uploader(
    label="Upload Documents (PDF, DOCX, TXT, HTML)",
    type=["pdf", "docx", "txt", "html"],
    accept_multiple_files=True,
)
def extract_text(file) -> str:
    """Return the plain text contained in an uploaded document.

    The format is selected from the extension of ``file.name``
    (case-insensitive): ``pdf`` is read with ``PdfReader``, ``docx`` with
    ``Document``, ``txt`` is decoded directly, and ``html`` is decoded and
    converted with ``html2text``.

    Args:
        file: Binary file-like object exposing a ``name`` attribute and a
            ``read()`` method that returns bytes.

    Returns:
        The extracted text, or an empty string when the extension is not
        one of the supported formats.
    """
    ext = file.name.lower().rsplit(".", 1)[-1]

    if ext == "pdf":
        # Extract every page exactly once; pages that yield no text are skipped.
        page_texts = (page.extract_text() for page in PdfReader(file).pages)
        return "\n".join(text for text in page_texts if text)

    if ext == "docx":
        return "\n".join(para.text for para in Document(file).paragraphs)

    if ext in ("txt", "html"):
        # "utf-8-sig" drops a leading BOM if present, and errors="replace"
        # keeps a file that is not valid UTF-8 from raising UnicodeDecodeError;
        # undecodable bytes become U+FFFD instead.
        raw = file.read().decode("utf-8-sig", errors="replace")
        return html2text.html2text(raw) if ext == "html" else raw

    return ""
# Input field
question = st.text_input("Ask a question about the uploaded documents:")

# Only this many characters of the combined document text are sent to the
# model; anything beyond it is dropped from the prompt.
MAX_DOCUMENT_CHARS = 6000

# When the "Ask" button is clicked, validate the inputs first so that a
# missing key, file or question produces a visible message rather than a
# click that appears to do nothing.
if st.button("Ask"):
    if not api_key:
        st.warning("Please enter your OpenAI API key in the sidebar.")
    elif not uploaded_files:
        st.warning("Please upload at least one document.")
    elif not question.strip():
        st.warning("Please enter a question.")
    else:
        with st.spinner("Processing..."):
            # Extract and combine text from all uploaded files, one
            # newline-terminated chunk per file.
            combined_text = "".join(
                extract_text(file) + "\n" for file in uploaded_files
            )
            if not combined_text.strip():
                st.warning("Could not extract text from uploaded files.")
            else:
                messages = [
                    {"role": "system", "content": "You are a helpful assistant that answers questions based on uploaded documents."},
                    {"role": "user", "content": f"DOCUMENT:\n{combined_text[:MAX_DOCUMENT_CHARS]}\n\nQUESTION:\n{question}"},
                ]
                try:
                    if hasattr(openai, "OpenAI"):
                        # openai>=1.0 removed openai.ChatCompletion; requests
                        # go through a client object instead.
                        client = openai.OpenAI(api_key=api_key)
                        response = client.chat.completions.create(
                            model="gpt-3.5-turbo",
                            messages=messages,
                            temperature=temperature,
                        )
                        answer = response.choices[0].message.content
                    else:
                        # Legacy (pre-1.0) module-level API.
                        openai.api_key = api_key
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=messages,
                            temperature=temperature,
                        )
                        answer = response["choices"][0]["message"]["content"]
                except Exception as e:
                    # Top-level UI boundary: report the failure to the user.
                    st.error(f"Error from OpenAI: {e}")
                else:
                    # Update conversation history
                    st.session_state.conversation.append({"role": "user", "content": question})
                    st.session_state.conversation.append({"role": "assistant", "content": answer})
# Display conversation (only once there is at least one exchange)
if st.session_state.conversation:
    st.markdown("## Conversation")
    for msg in st.session_state.conversation:
        speaker = "You" if msg["role"] == "user" else "Assistant"
        st.markdown(f"**{speaker}:** {msg['content']}")
    st.markdown("---")

    col1, col2 = st.columns(2)

    with col1:
        if st.button("Clear Conversation"):
            st.session_state.conversation = []
            # Newer Streamlit releases provide st.rerun in place of
            # st.experimental_rerun; use whichever this version has.
            rerun = getattr(st, "rerun", None) or st.experimental_rerun
            rerun()

    with col2:
        # Named download_format so the builtin format() is not shadowed.
        download_format = st.selectbox("Download Format", ["TXT", "JSON"])
        if download_format == "TXT":
            # One "Role:\ncontent" section per message, blank line between.
            content = "\n\n".join(
                f"{msg['role'].capitalize()}:\n{msg['content']}"
                for msg in st.session_state.conversation
            )
            mime = "text/plain"
            filename = "conversation.txt"
        else:
            content = json.dumps(st.session_state.conversation, indent=2)
            mime = "application/json"
            filename = "conversation.json"
        st.download_button("Download", content, file_name=filename, mime=mime)
|