File size: 3,912 Bytes
447c4d7
c6c3565
 
447c4d7
c6c3565
 
 
 
447c4d7
c6c3565
 
 
 
447c4d7
 
c6c3565
447c4d7
 
 
c6c3565
447c4d7
714614e
 
 
447c4d7
c6c3565
714614e
c6c3565
447c4d7
c6c3565
447c4d7
714614e
c6c3565
447c4d7
 
 
c6c3565
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714614e
c6c3565
 
714614e
 
c6c3565
 
 
 
 
447c4d7
c6c3565
714614e
c6c3565
 
 
 
714614e
c6c3565
 
 
 
 
 
 
 
 
 
 
447c4d7
c6c3565
714614e
447c4d7
c6c3565
447c4d7
714614e
447c4d7
c6c3565
447c4d7
 
 
 
 
714614e
447c4d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714614e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import streamlit as st
import openai
import os
import json
from io import StringIO
from PyPDF2 import PdfReader
from docx import Document
import html2text

# Point Streamlit's config directory at /tmp (intended to avoid config
# issues on HF Spaces).
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"

# Page-level configuration.
st.set_page_config(page_title="Document Parser", layout="wide")

# The chat history lives in session state; create it the first time through.
if "conversation" not in st.session_state:
    st.session_state["conversation"] = []

# Sidebar: API credentials and sampling temperature.
st.sidebar.title("Settings")
api_key = st.sidebar.text_input("OpenAI API Key", type="password")
temperature = st.sidebar.slider(
    "Temperature",
    min_value=0.0,
    max_value=1.0,
    value=0.3,
    step=0.1,
)

# Main page heading and short description.
st.title("Document Parser")
st.markdown("Upload documents and ask questions using GPT.")

# Document upload widget; several files may be selected at once.
SUPPORTED_TYPES = ["pdf", "docx", "txt", "html"]
uploaded_files = st.file_uploader(
    "Upload Documents (PDF, DOCX, TXT, HTML)",
    type=SUPPORTED_TYPES,
    accept_multiple_files=True,
)

def extract_text(file):
    """Return the plain-text content of an uploaded document.

    The parser is chosen from the extension of ``file.name``
    (case-insensitive): ``pdf`` is read with ``PdfReader``, ``docx`` with
    ``Document``, ``txt`` is decoded directly and ``html`` is converted with
    ``html2text``.

    Args:
        file: Binary file-like object exposing a ``name`` attribute and a
            ``read()`` method that returns bytes.

    Returns:
        The extracted text, or ``""`` when the extension is not one of the
        supported formats.
    """
    # Rewind first so a stream that has already been consumed (for example by
    # an earlier call on the same object) is read from the start again.
    if hasattr(file, "seek"):
        file.seek(0)

    ext = file.name.lower().rpartition(".")[-1]

    if ext == "pdf":
        reader = PdfReader(file)
        # Extract each page exactly once; falsy results are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)

    if ext == "docx":
        doc = Document(file)
        return "\n".join(para.text for para in doc.paragraphs)

    if ext == "txt":
        # Undecodable bytes become U+FFFD instead of raising, so a file in a
        # different encoding does not abort the whole request.
        return file.read().decode("utf-8", errors="replace")

    if ext == "html":
        return html2text.html2text(file.read().decode("utf-8", errors="replace"))

    return ""

# Input field
question = st.text_input("Ask a question about the uploaded documents:")

# Upper bound on the number of document characters sent with each request.
MAX_DOCUMENT_CHARS = 6000

# When "Ask" button is clicked
if st.button("Ask"):
    # Tell the user what is missing instead of silently ignoring the click.
    missing = []
    if not api_key:
        missing.append("an OpenAI API key")
    if not uploaded_files:
        missing.append("at least one document")
    if not question.strip():
        missing.append("a question")

    if missing:
        st.warning("Please provide " + ", ".join(missing) + ".")
    else:
        with st.spinner("Processing..."):

            # Extract text file by file so that one unreadable document is
            # reported and skipped rather than aborting the whole request.
            extracted = []
            for file in uploaded_files:
                try:
                    extracted.append(extract_text(file))
                except Exception as e:  # UI boundary: surface the error, keep going
                    st.warning(f"Could not read {file.name}: {e}")
            combined_text = "".join(f"{text}\n" for text in extracted)

            if not combined_text.strip():
                st.warning("Could not extract text from uploaded files.")
            else:
                messages = [
                    {"role": "system", "content": "You are a helpful assistant that answers questions based on uploaded documents."},
                    {"role": "user", "content": f"DOCUMENT:\n{combined_text[:MAX_DOCUMENT_CHARS]}\n\nQUESTION:\n{question}"},
                ]
                try:
                    if hasattr(openai, "OpenAI"):
                        # openai>=1.0: a per-request client keeps the key out
                        # of module-global state shared between sessions.
                        client = openai.OpenAI(api_key=api_key)
                        response = client.chat.completions.create(
                            model="gpt-3.5-turbo",
                            messages=messages,
                            temperature=temperature,
                        )
                        answer = response.choices[0].message.content or ""
                    else:
                        # Legacy openai<1.0 interface.
                        openai.api_key = api_key
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=messages,
                            temperature=temperature,
                        )
                        answer = response["choices"][0]["message"]["content"]
                except Exception as e:
                    st.error(f"Error from OpenAI: {e}")
                else:
                    # Update conversation history only after a successful call
                    st.session_state.conversation.append({"role": "user", "content": question})
                    st.session_state.conversation.append({"role": "assistant", "content": answer})

# Display conversation
if st.session_state.conversation:
    st.markdown("## Conversation")
    for msg in st.session_state.conversation:
        speaker = "You" if msg["role"] == "user" else "Assistant"
        st.markdown(f"**{speaker}:** {msg['content']}")

    st.markdown("---")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("Clear Conversation"):
            st.session_state.conversation = []
            # Prefer st.rerun where it exists; fall back to the older
            # experimental name on Streamlit versions that lack it.
            rerun = getattr(st, "rerun", None) or st.experimental_rerun
            rerun()

    with col2:
        # Named download_format so the builtin format() is not shadowed.
        download_format = st.selectbox("Download Format", ["TXT", "JSON"])
        if download_format == "TXT":
            content = "\n\n".join(
                f"{msg['role'].capitalize()}:\n{msg['content']}"
                for msg in st.session_state.conversation
            )
            mime = "text/plain"
            filename = "conversation.txt"
        else:
            # ensure_ascii=False keeps non-ASCII text readable in the export.
            content = json.dumps(
                st.session_state.conversation, indent=2, ensure_ascii=False
            )
            mime = "application/json"
            filename = "conversation.json"

        st.download_button("Download", content, file_name=filename, mime=mime)