Spaces:
Build error
Build error
Commit
·
a2e10cf
1
Parent(s):
4562ed6
add oai and modules
Browse files- README.md +0 -3
- all_unique_modules.json +0 -0
- app.py +118 -0
- oai.py +54 -0
README.md
CHANGED
|
@@ -8,9 +8,6 @@ sdk_version: 3.40.1
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
| 11 |
-
|
| 12 |
-
hf_oauth: true
|
| 13 |
-
hf_oauth_redirect_path: /custom_callback_route # optional, see "Redirect URLs" below
|
| 14 |
---
|
| 15 |
|
| 16 |
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: apache-2.0
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
|
all_unique_modules.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
CHANGED
|
@@ -5,6 +5,7 @@ from typing import Optional
|
|
| 5 |
import json
|
| 6 |
|
| 7 |
from load_db import load_vectorestore_from_pdf
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
TEMP_PDF_PATH = "temp.pdf"
|
|
@@ -12,6 +13,18 @@ retriever = None
|
|
| 12 |
db = None
|
| 13 |
documents = None
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def pdf_to_text(file_path:str, page_num:Optional[int]=None):
|
| 16 |
reader = PdfReader(file_path)
|
| 17 |
if page_num:
|
|
@@ -37,6 +50,102 @@ def load_pdf(inp):
|
|
| 37 |
#print(text)
|
| 38 |
return text
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
with gr.Blocks() as app:
|
| 42 |
file = gr.File(type="binary")
|
|
@@ -47,6 +156,15 @@ with gr.Blocks() as app:
|
|
| 47 |
load_file_button.click(load_pdf,inputs=file,outputs=handbook)
|
| 48 |
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
if __name__ == "__main__":
|
|
|
|
| 5 |
import json
|
| 6 |
|
| 7 |
from load_db import load_vectorestore_from_pdf
|
| 8 |
+
from oai import chat_prompt, get_comparison
|
| 9 |
|
| 10 |
|
| 11 |
TEMP_PDF_PATH = "temp.pdf"
|
|
|
|
| 13 |
db = None
|
| 14 |
documents = None
|
| 15 |
|
| 16 |
+
# Load the IU module catalogue once at import time; get_course() scans it.
with open("all_unique_modules.json", mode="r", encoding="UTF-8") as modules_file:
    courses = json.load(modules_file)
|
| 18 |
+
|
| 19 |
+
def get_course(code:str)->str|None:
|
| 20 |
+
#print(len(courses))
|
| 21 |
+
for c in courses:
|
| 22 |
+
if not "Kurscode" in c:
|
| 23 |
+
continue
|
| 24 |
+
if c["Kurscode"].strip()==code.strip():
|
| 25 |
+
return f"Kursbeschreibung:\n{c['Description']}\nKursziele:\n{c['Kursziele']}\nKursinhalte:\n{c['Kursinhalt']}"
|
| 26 |
+
return None
|
| 27 |
+
|
| 28 |
def pdf_to_text(file_path:str, page_num:Optional[int]=None):
|
| 29 |
reader = PdfReader(file_path)
|
| 30 |
if page_num:
|
|
|
|
| 50 |
#print(text)
|
| 51 |
return text
|
| 52 |
|
| 53 |
+
def compare_with_gpt(iu_code:str,text:str, model_name:Optional[str]=None)->str|None:
    """Compare a foreign module description with an IU course via the chat model.

    Args:
        iu_code: IU course code; resolved to a course text with ``get_course``.
        text: Description of the other university's module.
        model_name: Chat model to use. ``None`` selects ``"gpt-4"``.

    Returns:
        The result of ``get_comparison`` for the resolved course, or ``None``
        when ``get_course`` finds no course for ``iu_code`` (a notice is
        printed in that case).
    """
    iu_course = get_course(iu_code)
    if iu_course is None:
        print("IU Course not found")
        return None
    # Only a missing model name falls back to the default; any explicit
    # value is forwarded unchanged.
    if model_name is None:
        model_name = "gpt-4"
    return get_comparison(iu_course, text, model_name=model_name)
|
| 61 |
+
|
| 62 |
+
def get_relevant_docs(search:str, iu_course:str):
    """Find a module description in the loaded handbook and compare it with an IU course.

    Steps, each of which appends to the returned log text:

    1. Query the module-level ``retriever`` for documents matching ``search``.
    2. Ask the chat model (gpt-4) which retrieved document holds the module
       description; the answer is expected as JSON with ``page_found`` and
       ``page``.
    3. Fetch the following page from the module-level ``db`` and ask the chat
       model whether the description is complete; if it answers
       "unvollständig", the following page's text is appended.
    4. Pass the description to ``compare_with_gpt`` together with
       ``iu_course``.

    Args:
        search: Name of the module/course to look for in the handbook.
        iu_course: Forwarded to ``compare_with_gpt`` as its ``iu_code``
            argument (the UI wires the "IU Kurscode" textbox to it).

    Returns:
        The accumulated log text. The function returns early, with whatever
        was logged so far, when no handbook is loaded, when the model's
        answer names no usable page, or when the named page is not among the
        retrieved documents.
    """
    global db, retriever, documents
    # Both globals start out as None and are only usable once a PDF was loaded.
    if retriever is None or db is None:
        return "Es ist noch kein Modulhandbuch geladen. Bitte zuerst eine PDF-Datei laden."

    documents = retriever.get_relevant_documents(f"Wo ist Modul/Kurs: {search.strip()}")
    final_res = ""
    for document_ in documents:
        print(f'\n>{document_.metadata["source"]} (Page {document_.metadata["page"]})')
        final_res = f'{final_res}\n>{document_.metadata["source"]} (Page {document_.metadata["page"]}) :'

    find_page_prompt = f"""

Du wirst 4 verschiedene Objecte in JSON erhalten und musst herausfinden welches ich suche.
Es handelt sich dabei um Ausschnitte eines Modulhandbuches einer Hochschule.
Ich suche die Modulbeschreibung von '{search}'. Suche danach in dem property 'page_content'
Nenne die Page in den Metadaten des richtigen Objektes. Denk dir keinen Quatsch aus, wenn du die Modulbeschreibung nicht findest, sag es.
Antworte in JSON format und fülle die werte page_found (bool) und page (int|None) aus.
""".strip()
    messages = [
        {"role": "system", "content": find_page_prompt},
        {"role": "system", "content": f"Das sind die 4 Dokumente:\n{documents}"},
    ]
    # gpt-3.5-turbo-16k does not understand this task, hence gpt-4.
    res = chat_prompt(messages=messages,model_name="gpt-4")

    final_res = f'{final_res}\n\n{res}\n'
    page_num = _page_from_answer(res)
    if page_num is None:
        return final_res

    final_res = f'{final_res}\n\nDocument auf Seite >{page_num} identifiziert.\nPrüfe auf Vollständigkeit...\n'

    document = next((doc for doc in documents if doc.metadata["page"] == page_num), None)
    if document is None:
        # The model named a page that is not among the retrieved documents;
        # there is nothing to check or compare.
        return f'{final_res}\nSeite {page_num} ist nicht unter den gefundenen Dokumenten.\n'
    print(page_num,"==",document.metadata["page"])

    # Kept in a local: the global `documents` must keep holding the retrieved
    # documents, not the raw store result.
    next_page = db.get(where={"page":page_num+1})
    next_page_texts = next_page["documents"]
    # Empty string when the handbook has no following page.
    next_page_text = next_page_texts[0] if next_page_texts else ""

    # The end of the found page and the start of the next one are enough for
    # the completeness check.
    first_40_words_str = ' '.join(next_page_text.split()[:40])
    last_50_words_str = ' '.join(document.page_content.split()[-50:])

    check_doc_complete_prompt= f"""
Du wirst einen Auszug einer Modulbeschreibung eines Modules von einer Hochschule erhalten und sollst herausfinden, ob diese vollständig ist oder ob eventuell die nächste
Seite auch noch Inhalte zum Modul hat. Solltest du eine Auflistung der Kompetenzen und Inhalte finden, ist die Modulbeschreibung vollständig. Wenn auf der Folgeseite nur noch Literaturangaben sind, ist die Modulbeschreibung bereits vollständig.
Wenn der Auszug mit den Literaturangaben endet, ist die Modulbeschreibung vollständig.
Antworte nur mit 'vollständig' und 'unvollständig'.
Das Modul heißt [{search}]. Ist die Beschreibung vollständig oder fehlt etwas? Hier ist der Text: \n[{last_50_words_str}].
Das ist hier ist der Text der nächsten Seite:\n[{first_40_words_str}]
""".strip()

    messages = [
        {"role": "system", "content": check_doc_complete_prompt},
    ]

    res = chat_prompt(messages=messages,model_name="gpt-3.5-turbo-16k")
    final_res = f'{final_res}\nDie Beschreibung ist: {res}\n'

    description = f"{document.page_content}"

    # Tolerate case, punctuation and surrounding text in the model's answer;
    # "vollständig" is a substring of "unvollständig", so test the longer word.
    if "unvollständig" in (res or "").lower():
        if next_page_text:
            description = f"{description}\n{next_page_text}"
    else:
        final_res = f'{final_res}\nDie Beschreibung ist: {description}\n'

    final_res = f'{final_res}\nSuche nach: {search}\nBeschreibung: {description}\n\n'
    res = compare_with_gpt(iu_course,description,model_name="gpt-3.5-turbo-16k")

    final_res = f'{final_res}\nIU-Kurs:{iu_course}\nErgebnis: {res}\n'

    return final_res


def _page_from_answer(raw_answer):
    """Return the page number from the model's JSON answer, or None if it names none."""
    try:
        answer = json.loads(raw_answer)
    except (TypeError, ValueError):
        # chat_prompt returns None on failure, and the model may reply with
        # text that is not JSON.
        return None
    if not isinstance(answer, dict) or not answer.get("page_found"):
        return None
    try:
        return int(answer.get("page"))
    except (TypeError, ValueError):
        return None
|
| 149 |
|
| 150 |
with gr.Blocks() as app:
|
| 151 |
file = gr.File(type="binary")
|
|
|
|
| 156 |
load_file_button.click(load_pdf,inputs=file,outputs=handbook)
|
| 157 |
|
| 158 |
|
| 159 |
+
with gr.Accordion("Kurse finden",open=False):
|
| 160 |
+
with gr.Row():
|
| 161 |
+
course_query = gr.Textbox(label="Kursnamen der zu prüfenden Kurse")
|
| 162 |
+
iu_ccode = gr.Textbox(label="IU Kurscode")
|
| 163 |
+
course_description = gr.TextArea(label="Ergebnis der Prüfung")
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
course_query.submit(get_relevant_docs,inputs=[course_query,iu_ccode],outputs=course_description)
|
| 167 |
+
|
| 168 |
|
| 169 |
|
| 170 |
if __name__ == "__main__":
|
oai.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import openai
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
openai.api_key = os.getenv("OPENAI_API_KEY")
|
| 8 |
+
|
| 9 |
+
def get_comparison(iu_course,foreign_course,model_name:str="gpt-3.5-turbo")->str|None:
    """Ask the chat model whether a foreign module is equivalent to an IU module.

    Builds a system prompt that embeds ``iu_course`` and sends
    ``foreign_course`` as the user message. The request itself is delegated
    to ``chat_prompt`` so both functions share one request and error path.

    Args:
        iu_course: Text describing the IU module; interpolated into the
            system prompt.
        foreign_course: Text describing the other university's module.
        model_name: Chat model to use.

    Returns:
        The model's answer, or ``None`` when ``chat_prompt`` reports a
        failure.
    """
    system_prompt = f"""
You are comparing one educational module between IU - International University of Applied Sciences and one module of another university for equivalency. The module of the other university may include several submodules, all submodules count as one.

User will provide a description of the module of another university and your job is to figure out, if this module is equivalent to the IU module.

List which key competencies in the IU module are not present in the other module.

List which content in the IU module is not present in the other module.

Find the most important key competencies in the IU Module and list them, judge if they are present in the other module (if present and if the other module contains submodules, name the submodule).

Find the most important key competencies in the other module and list them, judge if they are present in the IU module (if present and if the other module contains submodules, name the submodule).

Finally, estimate how much of the modules are equivalent in percent considering the key competencies and content that is present in IU Module but not in the other (the more present the more equivalent). Make a judge if the compared modules are equivalent, keep in mind that same topics might be called different.

Answer only in german language.

This is the module from IU - International University of Applied Sciences: \n{iu_course}
"""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{foreign_course}"},
    ]
    # Same sampling settings as before: temperature 0, top_p 0.9.
    return chat_prompt(messages, temperature=0, top_p=0.9, model_name=model_name)
|
| 42 |
+
|
| 43 |
+
def chat_prompt(messages,temperature:float=0.0,top_p:float=0.9,model_name:str="gpt-3.5-turbo")->str|None:
    """Send a chat conversation to ``openai.ChatCompletion.create``.

    Args:
        messages: Chat messages, passed through unchanged.
        temperature: Sampling temperature for the request.
        top_p: Nucleus-sampling parameter for the request.
        model_name: Chat model to use.

    Returns:
        The ``"content"`` of the first choice's message, or ``None`` when any
        exception occurs (the exception is printed).
    """
    request = {
        "model": model_name,
        "messages": messages,
        "temperature": temperature,
        "top_p": top_p,
    }
    try:
        response = openai.ChatCompletion.create(**request)
        first_choice = response.choices[0]
        return first_choice.message["content"]
    except Exception as error:
        # Best effort: callers treat None as "no answer".
        print(error)
        return None
|