reyemhorts committed on
Commit
a2e10cf
·
1 Parent(s): 4562ed6

add oai and modules

Browse files
Files changed (4) hide show
  1. README.md +0 -3
  2. all_unique_modules.json +0 -0
  3. app.py +118 -0
  4. oai.py +54 -0
README.md CHANGED
@@ -8,9 +8,6 @@ sdk_version: 3.40.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
-
12
- hf_oauth: true
13
- hf_oauth_redirect_path: /custom_callback_route # optional, see "Redirect URLs" below
14
  ---
15
 
16
 
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
 
 
11
  ---
12
 
13
 
all_unique_modules.json ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -5,6 +5,7 @@ from typing import Optional
5
  import json
6
 
7
  from load_db import load_vectorestore_from_pdf
 
8
 
9
 
10
  TEMP_PDF_PATH = "temp.pdf"
@@ -12,6 +13,18 @@ retriever = None
12
  db = None
13
  documents = None
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def pdf_to_text(file_path:str, page_num:Optional[int]=None):
16
  reader = PdfReader(file_path)
17
  if page_num:
@@ -37,6 +50,102 @@ def load_pdf(inp):
37
  #print(text)
38
  return text
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  with gr.Blocks() as app:
42
  file = gr.File(type="binary")
@@ -47,6 +156,15 @@ with gr.Blocks() as app:
47
  load_file_button.click(load_pdf,inputs=file,outputs=handbook)
48
 
49
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  if __name__ == "__main__":
 
5
  import json
6
 
7
  from load_db import load_vectorestore_from_pdf
8
+ from oai import chat_prompt, get_comparison
9
 
10
 
11
  TEMP_PDF_PATH = "temp.pdf"
 
13
  db = None
14
  documents = None
15
 
16
# Load the IU module catalogue once at import time; get_course() searches it.
with open("all_unique_modules.json", "r", encoding="UTF-8") as modules_file:
    courses = json.load(modules_file)
18
+
19
+ def get_course(code:str)->str|None:
20
+ #print(len(courses))
21
+ for c in courses:
22
+ if not "Kurscode" in c:
23
+ continue
24
+ if c["Kurscode"].strip()==code.strip():
25
+ return f"Kursbeschreibung:\n{c['Description']}\nKursziele:\n{c['Kursziele']}\nKursinhalte:\n{c['Kursinhalt']}"
26
+ return None
27
+
28
  def pdf_to_text(file_path:str, page_num:Optional[int]=None):
29
  reader = PdfReader(file_path)
30
  if page_num:
 
50
  #print(text)
51
  return text
52
 
53
def compare_with_gpt(iu_code: str, text: str, model_name: Optional[str] = None) -> str | None:
    """Compare a foreign module description with the IU course ``iu_code``.

    Args:
        iu_code: Course code used to look up the IU course via ``get_course``.
        text: Description of the other university's module.
        model_name: Chat model to use; ``"gpt-4"`` when ``None``.

    Returns:
        Whatever ``get_comparison`` returns, or ``None`` when the IU course
        cannot be found.
    """
    iu_course = get_course(iu_code)
    if iu_course is None:
        print("IU Course not found")
        return None

    # Only ``None`` selects the default; any explicit value is passed through.
    chosen_model = "gpt-4" if model_name is None else model_name
    return get_comparison(iu_course, text, model_name=chosen_model)
61
+
62
def get_relevant_docs(search: str, iu_course: str) -> str:
    """Locate the description of module ``search`` and compare it with an IU course.

    Steps:
      1. Query the global ``retriever`` for handbook excerpts matching ``search``.
      2. Ask GPT-4 which excerpt holds the module description (JSON answer
         with ``page_found`` and ``page``).
      3. Fetch the following page from the global ``db`` and ask the model
         whether the description continues there; if so, append that page.
      4. Pass the description to ``compare_with_gpt``.

    Args:
        search: Name of the module to look for in the handbook.
        iu_course: Value forwarded to ``compare_with_gpt`` as the IU course
            code (it is also echoed after ``IU-Kurs:`` in the report).

    Returns:
        A text log of every step, ending with the comparison result. The log
        ends early when no handbook is loaded, the model's answer cannot be
        parsed, or the reported page is not among the retrieved excerpts.

    NOTE(review): the original indentation was lost; the ``else`` branch is
    assumed to pair with the "next page exists" check -- confirm against the
    real file.
    """
    global db, retriever, documents

    # Without a loaded handbook there is nothing to search.
    if retriever is None or db is None:
        return "Es wurde noch kein Modulhandbuch geladen."

    final_res = ""
    documents = retriever.get_relevant_documents(f"Wo ist Modul/Kurs: {search.strip()}")
    for document_ in documents:
        source_line = f'\n>{document_.metadata["source"]} (Page {document_.metadata["page"]})'
        print(source_line)
        final_res = f'{final_res}{source_line} :'

    get_relevant_docs_promt = "\n".join([
        "Du wirst 4 verschiedene Objecte in JSON erhalten und musst herausfinden welches ich suche.",
        "Es handelt sich dabei um Ausschnitte eines Modulhandbuches einer Hochschule.",
        f"Ich suche die Modulbeschreibung von '{search}'. Suche danach in dem property 'page_content'",
        "Nenne die Page in den Metadaten des richtigen Objektes. Denk dir keinen Quatsch aus, wenn du die Modulbeschreibung nicht findest, sag es.",
        "Antworte in JSON format und fülle die werte page_found (bool) und page (int|None) aus.",
    ]).strip()
    messages = [
        {"role": "system", "content": f"{get_relevant_docs_promt}"},
        {"role": "system", "content": f"Das sind die 4 Dokumente:\n{documents}"},
    ]
    # gpt-3.5-turbo-16k does not understand this task, hence gpt-4.
    res = chat_prompt(messages=messages, model_name="gpt-4")
    final_res = f'{final_res}\n\n{res}\n'

    # chat_prompt returns None on failure and the model may answer with
    # something that is not the requested JSON object.
    try:
        answer = json.loads(res)
        if not answer["page_found"]:
            return final_res
        page_num = int(answer["page"])
    except (TypeError, ValueError, KeyError):
        return f'{final_res}\nDie Antwort des Modells konnte nicht ausgewertet werden.\n'

    final_res = f'{final_res}\n\nDocument auf Seite >{page_num} identifiziert.\nPrüfe auf Vollständigkeit...\n'

    # The model may name a page that is not among the retrieved excerpts.
    document = next((doc for doc in documents if doc.metadata["page"] == page_num), None)
    if document is None:
        return f'{final_res}\nSeite {page_num} ist nicht unter den gefundenen Dokumenten.\n'

    description = f"{document.page_content}"

    # Kept in a local so the global ``documents`` still holds the excerpts.
    next_page_hits = db.get(where={"page": page_num + 1})
    next_page_texts = next_page_hits.get("documents") or []

    if next_page_texts:
        next_page = next_page_texts[0]
        first_40_words_str = ' '.join(next_page.split()[:40])
        last_50_words_str = ' '.join(description.split()[-50:])

        check_doc_complete_prompt = "\n".join([
            "Du wirst einen Auszug einer Modulbeschreibung eines Modules von einer Hochschule erhalten und sollst herausfinden, ob diese vollständig ist oder ob eventuell die nächste",
            "Seite auch noch Inhalte zum Modul hat. Solltest du eine Auflistung der Kompetenzen und Inhalte finden, ist die Modulbeschreibung vollständig. Wenn auf der Folgeseite nur noch Literaturangaben sind, ist die Modulbeschreibung bereits vollständig.",
            "Wenn der Auszug mit den Literaturangaben endet, ist die Modulbeschreibung vollständig.",
            "Antworte nur mit 'vollständig' und 'unvollständig'.",
            f"Das Modul heißt [{search}]. Ist die Beschreibung vollständig oder fehlt etwas? Hier ist der Text: \n[{last_50_words_str}].",
            f"Das ist hier ist der Text der nächsten Seite:\n[{first_40_words_str}]",
        ]).strip()
        messages = [
            {"role": "system", "content": f"{check_doc_complete_prompt}"},
        ]
        res = chat_prompt(messages=messages, model_name="gpt-3.5-turbo-16k")
        final_res = f'{final_res}\nDie Beschreibung ist: {res}\n'

        # Tolerate capitalisation, quotes or punctuation around the verdict.
        if res is not None and "unvollständig" in res.casefold():
            description = f"{description}\n{next_page}"
    else:
        # No following page stored: the identified page is the description.
        final_res = f'{final_res}\nDie Beschreibung ist: {description}\n'

    final_res = f'{final_res}\nSuche nach: {search}\nBeschreibung: {description}\n\n'
    res = compare_with_gpt(iu_course, description, model_name="gpt-3.5-turbo-16k")

    final_res = f'{final_res}\nIU-Kurs:{iu_course}\nErgebnis: {res}\n'

    return final_res
149
 
150
  with gr.Blocks() as app:
151
  file = gr.File(type="binary")
 
156
  load_file_button.click(load_pdf,inputs=file,outputs=handbook)
157
 
158
 
159
+ with gr.Accordion("Kurse finden",open=False):
160
+ with gr.Row():
161
+ course_query = gr.Textbox(label="Kursnamen der zu prüfenden Kurse")
162
+ iu_ccode = gr.Textbox(label="IU Kurscode")
163
+ course_description = gr.TextArea(label="Ergebnis der Prüfung")
164
+
165
+
166
+ course_query.submit(get_relevant_docs,inputs=[course_query,iu_ccode],outputs=course_description)
167
+
168
 
169
 
170
  if __name__ == "__main__":
oai.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import openai
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+ openai.api_key = os.getenv("OPENAI_API_KEY")
8
+
9
def _comparison_messages(iu_course, foreign_course):
    """Build the chat messages asking for an equivalency check of two modules."""
    instructions = "\n\n".join([
        "You are comparing one educational module between IU - International University of Applied Sciences and one module of another university for equivalency. The module of the other university may include several submodules, all submodules count as one.",
        "User will provide a description of the module of another university and your job is to figure out, if this module is equivalent to the IU module.",
        "List which key competencies in the IU module are not present in the other module.",
        "List which content in the IU module is not present in the other module.",
        "Find the most important key competencies in the IU Module and list them, judge if they are present in the other module (if present and if the other module contains submodules, name the submodule).",
        "Find the most important key competencies in the other module and list them, judge if they are present in the IU module (if present and if the other module contains submodules, name the submodule).",
        "Finally, estimate how much of the modules are equivalent in percent considering the key competencies and content that is present in IU Module but not in the other (the more present the more equivalent). Make a judge if the compared modules are equivalent, keep in mind that same topics might be called different.",
        "Answer only in german language.",
        f"This is the module from IU - International University of Applied Sciences: \n{iu_course}",
    ])
    return [
        {"role": "system", "content": f"\n{instructions}\n"},
        {"role": "user", "content": f"{foreign_course}"},
    ]


def get_comparison(iu_course, foreign_course, model_name: str = "gpt-3.5-turbo") -> str | None:
    """Ask a chat model whether a foreign module is equivalent to an IU module.

    Args:
        iu_course: Description of the IU module; embedded in the system prompt.
        foreign_course: Description of the other university's module; sent as
            the user message.
        model_name: Name of the chat model to query.

    Returns:
        The model's answer, or ``None`` when ``chat_prompt`` reports a failure.
    """
    # Delegate to chat_prompt so request and error handling live in one place;
    # the sampling parameters are the ones this function always used.
    return chat_prompt(
        _comparison_messages(iu_course, foreign_course),
        temperature=0,
        top_p=0.9,
        model_name=model_name,
    )
42
+
43
+ def chat_prompt(messages,temperature:float=0.0,top_p:float=0.9,model_name:str="gpt-3.5-turbo")->str|None:
44
+ try:
45
+ completion = openai.ChatCompletion.create(
46
+ model=model_name,
47
+ messages=messages,
48
+ temperature=temperature,
49
+ top_p=top_p
50
+ )
51
+ return completion.choices[0].message["content"]
52
+ except Exception as e:
53
+ print(e)
54
+ return None