Spaces:

jasvir-singh1021
/

Easy-data-parser

Sleeping

File size: 3,912 Bytes

import streamlit as st
import openai
import os
import json
from io import StringIO
from PyPDF2 import PdfReader
from docx import Document
import html2text

# Optional: point Streamlit's config directory at /tmp to prevent config
# issues on HF Spaces
os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"

# Page-level configuration: title plus wide layout
st.set_page_config(layout="wide", page_title="Document Parser")

# Create the chat history list the first time a session runs the script
if "conversation" not in st.session_state:
    st.session_state["conversation"] = []

# Sidebar settings: API key (masked input) and sampling temperature
st.sidebar.title("Settings")
api_key = st.sidebar.text_input("OpenAI API Key", type="password")
temperature = st.sidebar.slider(
    "Temperature",
    min_value=0.0,
    max_value=1.0,
    value=0.3,
    step=0.1,
)

# Main page heading and short description
st.title("Document Parser")
st.markdown("Upload documents and ask questions using GPT.")

# Upload widget; several documents may be selected at once
uploaded_files = st.file_uploader(
    label="Upload Documents (PDF, DOCX, TXT, HTML)",
    type=["pdf", "docx", "txt", "html"],
    accept_multiple_files=True,
)

def extract_text(file):
    """Return the plain text contained in an uploaded document.

    The parser is chosen from the extension of ``file.name``
    (case-insensitive). PDF, DOCX, TXT and HTML are supported.

    Args:
        file: Binary file-like object exposing a ``name`` attribute and a
            ``read()`` method that returns bytes (presumably a Streamlit
            uploaded file; confirm against the caller).

    Returns:
        The extracted text, or an empty string when the extension is not
        one of the supported formats.
    """
    ext = file.name.lower().split(".")[-1]

    if ext == "pdf":
        reader = PdfReader(file)
        # Extract each page exactly once (extraction is the expensive step),
        # then drop pages that produced no text.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)

    if ext == "docx":
        doc = Document(file)
        return "\n".join(para.text for para in doc.paragraphs)

    if ext in ("txt", "html"):
        # errors="replace": an upload that is not valid UTF-8 degrades to
        # U+FFFD replacement characters instead of raising
        # UnicodeDecodeError and aborting the whole request.
        decoded = file.read().decode("utf-8", errors="replace")
        if ext == "html":
            return html2text.html2text(decoded)
        return decoded

    return ""

def _ask_openai(key, document_text, user_question, sampling_temperature):
    """Ask gpt-3.5-turbo a question about a document and return the answer text.

    Only the first 6000 characters of ``document_text`` are sent. The API key
    is passed per request instead of being stored in ``openai.api_key``, so it
    is not kept in module-level state.

    Supports both the client-object API (``openai.OpenAI``, openai>=1.0) and
    the legacy module-level ``openai.ChatCompletion`` API.

    Args:
        key: OpenAI API key.
        document_text: Combined text of the uploaded documents.
        user_question: The question to answer.
        sampling_temperature: Temperature forwarded to the completion call.

    Returns:
        The content of the first choice's message.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions based on uploaded documents."},
        {"role": "user", "content": f"DOCUMENT:\n{document_text[:6000]}\n\nQUESTION:\n{user_question}"},
    ]

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: ChatCompletion was removed in favour of client objects.
        client = openai.OpenAI(api_key=key)
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=sampling_temperature,
        )
        return response.choices[0].message.content

    # Legacy openai<1.0 API.
    response = openai.ChatCompletion.create(
        api_key=key,
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=sampling_temperature,
    )
    return response["choices"][0]["message"]["content"]


# Input field
question = st.text_input("Ask a question about the uploaded documents:")

# When "Ask" button is clicked
if st.button("Ask"):
    # Say which prerequisite is missing instead of silently doing nothing.
    if not uploaded_files:
        st.warning("Please upload at least one document.")
    elif not question:
        st.warning("Please enter a question.")
    elif not api_key:
        st.warning("Please enter your OpenAI API key in the sidebar.")
    else:
        with st.spinner("Processing..."):

            # Extract and combine text from all uploaded files. A file that
            # cannot be parsed is reported rather than crashing the script.
            combined_text = None
            try:
                combined_text = "".join(
                    extract_text(file) + "\n" for file in uploaded_files
                )
            except Exception as e:
                st.error(f"Could not read the uploaded files: {e}")

            if combined_text is not None and not combined_text.strip():
                st.warning("Could not extract text from uploaded files.")
            elif combined_text:
                try:
                    answer = _ask_openai(api_key, combined_text, question, temperature)
                except Exception as e:
                    st.error(f"Error from OpenAI: {e}")
                else:
                    # Update conversation history
                    st.session_state.conversation.append({"role": "user", "content": question})
                    st.session_state.conversation.append({"role": "assistant", "content": answer})

# Display conversation: render the history, then offer clear/download controls.
if st.session_state.conversation:
    st.markdown("## Conversation")
    for msg in st.session_state.conversation:
        st.markdown(f"**{'You' if msg['role'] == 'user' else 'Assistant'}:** {msg['content']}")

    st.markdown("---")
    col1, col2 = st.columns(2)

    with col1:
        if st.button("Clear Conversation"):
            st.session_state.conversation = []
            # st.experimental_rerun was deprecated in favour of st.rerun and
            # is absent from recent Streamlit releases; prefer st.rerun and
            # fall back to the old name only on versions that lack it.
            rerun = getattr(st, "rerun", None) or st.experimental_rerun
            rerun()

    with col2:
        # Named download_format (not `format`) to avoid shadowing the builtin.
        download_format = st.selectbox("Download Format", ["TXT", "JSON"])
        if download_format == "TXT":
            # One "Role:\ncontent" entry per message, separated by blank lines.
            content = "\n\n".join(
                f"{msg['role'].capitalize()}:\n{msg['content']}" for msg in st.session_state.conversation
            )
            mime = "text/plain"
            filename = "conversation.txt"
        else:
            content = json.dumps(st.session_state.conversation, indent=2)
            mime = "application/json"
            filename = "conversation.json"

        st.download_button("Download", content, file_name=filename, mime=mime)