File size: 9,753 Bytes
9338c7b
3bd2065
c686e31
045155e
3bd2065
c686e31
3bd2065
f0134bc
 
3bd2065
c686e31
 
 
 
 
7f36095
 
 
 
 
 
 
c686e31
 
 
 
9d007c3
c686e31
045155e
c686e31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bd2065
 
 
 
 
 
 
 
c686e31
 
 
 
 
 
3bd2065
 
c686e31
 
 
 
3bd2065
 
 
c686e31
2d01f52
 
9872eed
2d01f52
045155e
9872eed
 
 
7c520ae
2d01f52
 
 
 
 
 
 
 
 
 
 
9338c7b
 
 
 
7c520ae
 
 
 
2e64cea
f0134bc
 
 
 
 
 
 
 
 
045155e
f0134bc
 
045155e
f0134bc
 
9d007c3
 
f0134bc
 
 
045155e
3396745
 
 
 
f0134bc
9d007c3
f0134bc
 
 
045155e
3396745
 
 
 
f0134bc
9d007c3
f0134bc
 
 
 
045155e
3396745
 
 
 
f0134bc
9d007c3
f0134bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e64cea
 
3bd2065
7c520ae
 
 
 
 
 
 
 
3bd2065
c686e31
 
 
7f36095
045155e
 
 
2e64cea
 
 
 
 
 
 
 
 
c686e31
 
7f36095
 
21bcf26
3bd2065
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import time
import streamlit as st
import os
# import openai
from PyPDF2 import PdfReader
from openai import OpenAI
from langchain.chat_models import ChatOpenAI
# Password checked by the login form in main(). Read from the environment when
# the module is loaded; a missing ASK_ASH_PASSWORD variable raises KeyError here.
ASK_ASH_PASSWORD = os.environ["ASK_ASH_PASSWORD"]


def gpt4_new(prompt_text):
    """Ask GPT-4 to classify a document and return the model's answer.

    A fixed German system prompt instructs the model to answer with a single
    category from the ones given and without explanations; *prompt_text* is
    sent as the user message.

    Args:
        prompt_text: Complete user prompt (task, category list, document text).

    Returns:
        The message content of the first completion choice.

    Errors raised by the OpenAI client are not caught here and propagate to
    the caller.
    """
    # Adjacent string literals are concatenated without any separator, so each
    # fragment must carry its own trailing space; previously several sentences
    # ran together (e.g. "...klassifiziert.Du gibts..."). The wording itself is
    # left untouched.
    system_prompt = (
        "Du bist eine Maschine, auf Grund des Texts von PDF-Dokumenten, "
        "das Dokument in vorgegebene Kategorien klassifiziert. "
        "Du gibts möglichst kurze Antworten, am besten ein Wort. "
        "Du gibst keine Erklärungen oder Begründungen. "
        "Du klassifizierst nur nach den vorgegebenen Kategorien. "
        "Wenn ein Dokument partout nicht klassifizierbar ist, "
        "antwortest du mit '<no classification>'"
    )
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt_text},
        ],
    )
    return response.choices[0].message.content


# Not current: legacy helper; main() uses gpt4_new() instead.
def ask_gpt4(question):
    """Send *question* to GPT-4 via LangChain's ``ChatOpenAI`` and return the reply.

    This is a best-effort helper: it never raises. If anything goes wrong
    (missing API key, network error, ...), the exception message is returned
    as the result string instead.

    Args:
        question: The prompt text to send as a single user message.

    Returns:
        The text content of the model's reply, or ``str(exception)`` on failure.
    """
    print(question)
    try:
        # Previously the model object was created but never called, and was then
        # subscripted like a response dict, so the question was never submitted.
        chat_model = ChatOpenAI(model_name="gpt-4")
        reply = chat_model.invoke(question)
        return reply.content
    except Exception as e:
        # Deliberate best-effort: report the failure as the answer text.
        return str(e)


def process_prompts_and_save(my_prompts):
    """Answer every prompt with ask_gpt4() and dump the results to a text file.

    For each prompt one entry of the form ``<prompt>\\n\\n<response>\\n\\n\\n\\n``
    is printed and collected. If producing an entry fails, an
    ``<prompt>\\n\\nError:<message>\\n\\n\\n\\n`` entry is collected instead.
    All entries are then written to ``gpt4_responses.txt`` (UTF-8), replacing
    any previous content of that file.

    Args:
        my_prompts: Iterable of prompt strings.
    """
    collected = []

    for prompt in my_prompts:
        try:
            # TODO: add logic to read the file and classify it.
            entry = "{}\n\n{}\n\n\n\n".format(prompt, ask_gpt4(prompt))
            print(entry)
            collected.append(entry)
        except Exception as err:
            # Keep going: record the failure next to the prompt that caused it.
            collected.append("{}\n\nError:{}\n\n\n\n".format(prompt, str(err)))

    with open('gpt4_responses.txt', 'w', encoding='utf-8') as out:
        out.write("".join(collected))


def get_pdfs_text(pdf_docs):
    """Return the extracted text of all pages of all given PDFs as one string.

    Args:
        pdf_docs: Iterable of objects accepted by ``PdfReader``.

    Returns:
        The page texts concatenated in document and page order, with no
        separator; an empty string if *pdf_docs* is empty.
    """
    return "".join(
        page.extract_text()
        for document in pdf_docs
        for page in PdfReader(document).pages
    )


def get_pdf_text(pdf_document):
    """Return the extracted text of every page of one PDF as a single string.

    Args:
        pdf_document: Object accepted by ``PdfReader``.

    Returns:
        The page texts concatenated in page order, with no separator.
    """
    reader = PdfReader(pdf_document)
    return "".join(page.extract_text() for page in reader.pages)


def json_open(filename, encoding="utf-8"):
    """Return the complete text content of *filename*.

    Despite its name, this function does not parse JSON; it returns the file
    content as a plain string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used for decoding. Defaults to UTF-8 so that
            files written by this script (which are written as UTF-8) are read
            back correctly regardless of the platform's default encoding.

    Returns:
        The file content as ``str``.
    """
    with open(filename, "r", encoding=encoding) as f:
        return f.read()


def _render_readme():
    """Render the README section: purpose, licence, limitations and roadmap."""
    st.subheader("Funktion: ")
    st.write("der Doc Classifier von Elia Wäfler kann einige der BIM2FM Dokumente des ASH nach Disziplin, Doc typ. und Geschoss (später KBOB) klassifizieren. lade ein oder mehrere PDF-Dokumente hoch, um es auszuprobieren. Feedback und Bugs gerne an elia.waefler@insel.ch")
    st.write("Vielen Dank.")
    st.write("")
    st.subheader("Licence and credits")
    st.write("THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.")
    st.write("special thanks to OpenAI, Huggingface, Streamlit")
    l, r = st.columns(2)
    with l:
        st.subheader("Limitationen: ")
        st.write("bisher nur PDFs")
        st.write("nur Disziplin, Doc typ. und Geschoss")
        st.write("macht teilweise Fehler, vor allem bei Koordination, Datennetz usw, (unklare Disziplinen)")
        st.write("")
    with r:
        st.subheader("geplante Erweiterungen:")
        st.write("Text Beschreibung wird von AI hinzugefügt")
        st.write("jpg, bilder, tabellen, .xlsx, .docx alles möglich, nicht nur PDF/Text")
        st.write("Ecodomus API einbinden, um alle Dokumente zu überprüfen.")


def _classify_or_error(prompt):
    """Return the GPT-4 answer for *prompt*, or an error marker if the call fails."""
    try:
        return gpt4_new(prompt)
    except Exception:
        # Best effort per document: one failed request must not abort the batch.
        # (A bare ``except:`` would also trap KeyboardInterrupt/SystemExit.)
        return "<err_no_classification>"


def _format_metadata(rows):
    """Serialise metadata rows: each row is framed by newlines, items end in ';'."""
    return "".join(
        "\n" + "".join(f"{item};" for item in row) + "\n"
        for row in rows
    )


def main():
    """Run the Streamlit "Doc Classifier" page.

    Flow:
      1. Optionally show the README section.
      2. Until the session is logged in, show a password form that compares
         the input against ``ASK_ASH_PASSWORD``.
      3. Once logged in, let the user upload files; on button press each file
         is classified three times with GPT-4 (discipline, document type,
         storey). The answers are shown in three columns, written to
         ``ai_generated_metadata.txt`` and offered as a download.

    Reads the module-level names ``auftrag_0``, ``auftrag_1_disziplin``,
    ``auftrag_1_type``, ``auftrag_1_ge``, ``Baubranchen_Disziplinen``,
    ``Dokumententypen`` and ``ASH_Geschosse``, which must be defined before
    this function is called.

    NOTE(review): the file also defines ``auftrag_2`` (answer-format
    instruction ending in "Hier der Dokumenteninhalt: "), which is not part of
    the prompts built here; confirm whether that is intended.
    """
    import hmac  # local import keeps this block self-contained

    st.title("Doc Classifier")
    if st.toggle("show README"):
        _render_readme()
    if "login" not in st.session_state:
        st.session_state.login = False

    if not st.session_state.login:
        user_pw = st.text_input("ASK_ASH_PASSWORD: ", type="password")
        if st.button("check"):
            time.sleep(0.5)
            # Constant-time comparison; bytes are used because compare_digest
            # rejects non-ASCII str arguments.
            if hmac.compare_digest(user_pw.encode("utf-8"),
                                   ASK_ASH_PASSWORD.encode("utf-8")):
                st.session_state.login = True
                st.rerun()
        return

    uploaded_files = st.file_uploader("PDF Dokument", accept_multiple_files=True)

    if not st.button("classify KBOB!"):
        return
    # With accept_multiple_files=True the uploader yields an empty list (not
    # None) when nothing was uploaded, so test for emptiness.
    if not uploaded_files:
        st.warning("no file")
        return

    with st.container():
        col1, col2, col3 = st.columns(3)
        # One entry per classification pass: (column, heading, task text, categories).
        tasks = (
            (col1, "Disziplin", auftrag_1_disziplin, Baubranchen_Disziplinen),
            (col2, "Dokumententyp", auftrag_1_type, Dokumententypen),
            (col3, "Geschoss", auftrag_1_ge, ASH_Geschosse),
        )
        for column, heading, _task, _categories in tasks:
            with column:
                st.write(heading)
                st.write("")

        all_metadata = []
        for file in uploaded_files:
            metadata = [str(file.name)]
            # Extract the text once per file; it is reused by all three passes.
            with col1, st.spinner("GPT4 at work"):
                pdf_text = str(get_pdf_text(file))
            for column, _heading, task, categories in tasks:
                with column:
                    with st.spinner("GPT4 at work"):
                        prompt = auftrag_0 + task + str(categories) + pdf_text
                        answer = _classify_or_error(prompt)
                        print(prompt)
                        metadata.append(str(answer))
                    st.write(answer)
            all_metadata.append(metadata)

        metadata_filename = "ai_generated_metadata.txt"
        metadata_text = _format_metadata(all_metadata)
        with open(metadata_filename, 'w', encoding='utf-8') as f:
            f.write(metadata_text)

        st.success("classified, saved")
        # Offer the in-memory text directly instead of re-reading the file, so
        # the download does not depend on the platform's default encoding.
        st.download_button("Download Metadata", metadata_text, file_name=metadata_filename)


# Prompt building blocks and category lists that main() reads as module
# globals. They are defined at module level (rather than inside the __main__
# guard) so that importing this module and calling main() does not fail with a
# NameError.
auftrag_0 = "Klassifiziere dieses Dokument nach "
auftrag_1_disziplin = "diesen 'Baubranchen Disziplinen': "
auftrag_1_type = "diesen 'Dokumententypen': "
auftrag_1_ge = "diesen 'Geschossen': "
Baubranchen_Disziplinen = ['A-Architektur', 'B-Bauphysik', 'C-Rohrpostanlagen', 'D-Datennetz', 'E-Elektroanlagen',
                           'F-Fassadenplanung', 'G-Küche', 'H-Heizung', 'I-Innenausbau', 'K-Kälte', 'L-Lüftung',
                           'M-Medizintechnik', 'N-Fördertechnik', 'O-Gebäudebetrieb', 'P-Sprinkler',
                           'Q-Brandschutz', 'R-Koordination', 'S-Sanitär', 'T-Tragwerksplanung', 'W-Informatik',
                           'Z-Lichtplanung']
# NOTE(review): auftrag_2 is not referenced by the prompts built in main();
# confirm whether it was meant to be inserted before the document text.
auftrag_2 = ("gib nur den am besten passendsten Eintrag zurück. "
             "Keine weiteren Ausführungen oder Erklärungen. "
             "Antworte am besten in einem Wort. "
             "Hier der Dokumenteninhalt: ")
Dokumententypen = ['Fotodokumentation', 'Projektdokumentation (PD)', 'Objektdokumentation (OD)',
                   'Prozessdokumentation', 'Fachdokumentation', 'Anlagedokumentation']
ASH_Geschosse = ['U4', 'U3', 'U2', 'U1',
                 'A', 'B', 'C', 'D', 'E', 'F', 'G']


if __name__ == "__main__":
    main()