# Spaces: elia-waefler / classify_ASH (Sleeping)
# File size: 9,753 Bytes

import time
import streamlit as st
import os
# import openai
from PyPDF2 import PdfReader
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
# Password compared against the login field in ``main``; read once at import
# time. A missing ASK_ASH_PASSWORD environment variable raises KeyError here.
ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]


def gpt4_new(prompt_text, model="gpt-4"):
    """Classify a document via the OpenAI chat completions API.

    Sends a fixed German system prompt (instructing the model to answer with
    a single category, or '<no classification>') together with *prompt_text*
    as the user message.

    Args:
        prompt_text: The full user prompt (task, category list, document text).
        model: Name of the chat model to use. Defaults to ``"gpt-4"``.

    Returns:
        The content of the first choice of the API response.

    Raises:
        Whatever the OpenAI client raises on configuration or API errors;
        nothing is caught here.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # must end with its own separator; otherwise sentences run together
    # ("...klassifiziert.Du gibts...").
    system_prompt = (
        "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten, "
        "das Dokument in vorgegebene Kategorien klassifiziert. "
        "Du gibts möglichst kurze Antworten, am besten ein Wort. "
        "Du gibst keine Erklärungen oder Begründungen. "
        "Du klassifizierst nur nach den vorgegebenen Kategorien. "
        "Wenn ein Dokument partout nicht klassifizierbar ist, "
        "antwortest du mit '<no classification>'"
    )
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_text},
        ],
    )
    return response.choices[0].message.content


# Not currently used by the app (only by process_prompts_and_save).
def ask_gpt4(question):
    """Send *question* to the chat model and return the reply text.

    Args:
        question: The prompt to submit.

    Returns:
        The model's reply as a string. If anything goes wrong (missing API
        key, network error, ...), the exception message is returned instead
        of being raised.
    """
    print(question)
    try:
        # Previously the model object was created but the question was never
        # submitted, and the model object itself was subscripted like a
        # response dict, which always failed.
        reply = ChatOpenAI(model="gpt-4").invoke(question)
        return reply.content
    except Exception as e:
        # Deliberate best-effort: callers receive the error text as the answer.
        return str(e)


def process_prompts_and_save(my_prompts, output_path='gpt4_responses.txt'):
    """Ask the model each prompt and write all prompt/answer pairs to a file.

    Args:
        my_prompts: Iterable of prompt strings.
        output_path: File to (over)write with the results. Defaults to
            ``'gpt4_responses.txt'`` in the current working directory.

    Each entry is written as ``prompt``, a blank line, the response (or
    ``Error:<message>`` if the call raised), followed by four newlines.
    The file is written as UTF-8 even when *my_prompts* is empty.
    """
    responses = []

    for prompt in my_prompts:
        try:
            # ADD LOGIC TO READ FILE AND CLASSIFY
            response = ask_gpt4(prompt)
            sol = f"{prompt}\n\n{response}\n\n\n\n"
            print(sol)
            responses.append(sol)
        except Exception as e:
            # Keep going: record the failure next to the prompt that caused it.
            responses.append(f"{prompt}\n\nError:{str(e)}\n\n\n\n")

    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(responses)


def get_pdfs_text(pdf_docs):
    """Return the concatenated text of every page of every PDF in *pdf_docs*.

    Args:
        pdf_docs: Iterable of objects accepted by ``PdfReader`` (paths or
            file-like objects).

    Returns:
        One string with all page texts joined without a separator; an empty
        string when *pdf_docs* is empty.
    """
    # ``or ""`` guards against pages for which no text could be extracted,
    # and join avoids repeated string concatenation.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )


def get_pdf_text(pdf_document):
    """Return the concatenated text of all pages of a single PDF.

    Args:
        pdf_document: A path or file-like object accepted by ``PdfReader``.

    Returns:
        The page texts joined without a separator.
    """
    # ``or ""`` guards against pages for which no text could be extracted,
    # and join avoids repeated string concatenation.
    return "".join(
        page.extract_text() or "" for page in PdfReader(pdf_document).pages
    )


def json_open(filename):
    """Return the entire content of a UTF-8 text file as a string.

    Despite the name, no JSON parsing takes place; the raw text is returned.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as ``str``.
    """
    # Read explicitly as UTF-8: the metadata file this is used for is written
    # with encoding='utf-8', and the platform default may differ.
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()


def _render_readme():
    """Render the README section (purpose, licence, limitations, roadmap)."""
    st.subheader("Funktion: ")
    st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren. lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren. Feedback und Bugs gerne an elia.waefler@insel.ch")
    st.write("Vielen Dank.")
    st.write("")
    st.subheader("Licence and credits")
    st.write("THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.")
    st.write("special thanks to OpenAI, Huggingface, Streamlit")
    l, r = st.columns(2)
    with l:
        st.subheader("Limitationen: ")
        st.write("bisher nur PDFs")
        st.write("nur Disziplin, Doc typ. und Geschoss")
        st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
        st.write("")
    with r:
        st.subheader("geplante Erweiterungen:")
        st.write("Text Beschreibung wird von AI hinzugefügt")
        st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
        st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")


def _classify_or_placeholder(prompt):
    """Return the ``gpt4_new`` answer for *prompt*, or an error placeholder.

    A failed API call must not abort the whole batch, so any exception is
    replaced by the marker string ``"<err_no_classification>"``.
    """
    try:
        answer = gpt4_new(prompt)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # are no longer swallowed.
        answer = "<err_no_classification>"
    print(prompt)
    return answer


def _format_metadata(all_metadata):
    """Serialise metadata rows: blank line, then ``item;item;...;`` per row."""
    return "".join(
        "\n" + "".join(f"{item};" for item in row) + "\n"
        for row in all_metadata
    )


def main():
    """Run the Streamlit "Doc Classifier" page.

    Flow: optional README, then a password gate (``ASK_ASH_PASSWORD``); once
    logged in, uploaded PDFs are classified by discipline, document type and
    floor via ``gpt4_new``. The results are shown in three columns, written
    to ``ai_generated_metadata.txt`` and offered as a download.

    Reads the module globals ``auftrag_0``, ``auftrag_1_disziplin``,
    ``auftrag_1_type``, ``auftrag_1_ge``, ``Baubranchen_Disziplinen``,
    ``Dokumententypen`` and ``ASH_Geschosse``.
    """
    st.title("Doc Classifier")
    if st.toggle("show README"):
        _render_readme()
    if "login" not in st.session_state:
        st.session_state.login = False

    if not st.session_state.login:
        user_pw = st.text_input("ASK_ASH_PASSWORD: ", type="password")
        if st.button("check"):
            time.sleep(0.5)
            if user_pw == ASK_ASH_PASSWORD:
                st.session_state.login = True
                st.rerun()
        return

    uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)
    if not st.button("classify KBOB!"):
        return

    # With accept_multiple_files=True the uploader yields an empty list (not
    # None) when nothing was uploaded, so test truthiness rather than
    # ``is not None``.
    if not uploaded_files:
        st.warning("no file")
        return

    with st.container():
        col1, col2, col3 = st.columns(3)
        # One entry per classification: target column, column heading,
        # task phrase and the list of allowed categories.
        tasks = (
            (col1, "Disziplin", auftrag_1_disziplin, Baubranchen_Disziplinen),
            (col2, "Dokumententyp", auftrag_1_type, Dokumententypen),
            (col3, "Geschoss", auftrag_1_ge, ASH_Geschosse),
        )
        for column, heading, _, _ in tasks:
            with column:
                st.write(heading)
                st.write("")

        all_metadata = []
        for file in uploaded_files:
            pdf_text = str(get_pdf_text(file))
            metadata = [str(file.name)]
            for column, _, auftrag_1, categories in tasks:
                with column:
                    with st.spinner("GPT4 at work"):
                        # NOTE(review): the global ``auftrag_2`` ("... Hier der
                        # Dokumenteninhalt: ") is defined but never added to
                        # the prompt - confirm whether it should precede
                        # the document text.
                        answer = _classify_or_placeholder(
                            auftrag_0 + auftrag_1 + str(categories) + pdf_text
                        )
                        metadata.append(str(answer))
                    st.write(answer)
            all_metadata.append(metadata)

        metadata_filename = "ai_generated_metadata.txt"
        metadata_text = _format_metadata(all_metadata)
        with open(metadata_filename, 'w', encoding='utf-8') as f:
            f.write(metadata_text)

        st.success("classified, saved")
        # Serve the in-memory text instead of re-reading the file: the file
        # name is fixed, so a concurrent session could overwrite it between
        # the write and the read.
        st.download_button("Download Metadata", metadata_text, file_name=metadata_filename)


# Prompt fragments and category lists that ``main`` reads as module globals.
# They are defined at module level (not under the ``__main__`` guard) so that
# ``main`` also works when this module is imported instead of run as a script.
auftrag_0 = "Klassifiziere dieses Dokument nach "
auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
auftrag_1_type = "diesen 'Dokumententypen': "
auftrag_1_ge = "diesen 'Geschossen': "
# NOTE(review): auftrag_2 is not referenced by ``main`` - confirm whether it
# is meant to be inserted before the document text in the prompts.
auftrag_2 = (
    "gib nur den am besten passendsten Eintrag zurück. "
    "Keine weiteren Ausführungen oder Erklärungen. "
    "Antworte am besten in einem Wort. "
    "Hier der Dokumenteninhalt: "
)
Baubranchen_Disziplinen = [
    'A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
    'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
    'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
    'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
    'Z-Lichtplanung',
]
Dokumententypen = [
    'Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
    'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation',
]
ASH_Geschosse = [
    'U4', 'U3', 'U2', 'U1',
    'A', 'B', 'C', 'D', 'E', 'F', 'G',
]


if __name__ == "__main__":
    main()