Spaces:
No application file
No application file
Commit
·
8b9aa2c
1
Parent(s):
661b653
init files, idea
Browse files- .idea/.name +1 -0
- my_1_openai.py +59 -0
- my_1_reader.py +201 -0
- my_1_writer.py +98 -0
- my_2_embedder.py +169 -0
- my_2_sim_search.py +109 -0
.idea/.name
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
app.py
|
my_1_openai.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

import openai
from openai import OpenAI
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# Legacy (openai<1.0) global-style configuration; requires the `openai`
# module itself to be imported at the top of the file.
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.organization = os.environ.get("OPENAI_ORG_ID")
# System prompts keyed by persona name; values are passed as the "system"
# message content in the chat helpers below.
models = {
    "assistant": "You are a helpful assistant.",
    "binary": "you are a maschine that converts questions or prompts to binary outputs. "
              "you can only answer 'yes' or 'no'. if uncertain, default to 'no'."
}
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def gpt4_new(prompt_text):
    """Classify a document with GPT-4 via the v1 OpenAI client.

    :param prompt_text: user content to classify (the system prompt is a
        fixed German classification instruction).
    :return: the assistant's reply text.
    """
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    conversation = [
        {"role": "system",
         "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
        {"role": "user", "content": prompt_text},
    ]
    completion = client.chat.completions.create(model="gpt-4",
                                                messages=conversation)
    return completion.choices[0].message.content
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def gpt4(prompt, model=models["assistant"]):
    """Send *prompt* to GPT-4 with the given system prompt and return the reply.

    BUG FIX: the original called ``openai.Completion.create`` with a
    ``messages`` argument — the legacy Completion endpoint has no such
    parameter (and ``openai`` was never imported), so every call failed.
    Rewritten on the v1 ``OpenAI`` client already used by ``gpt4_new``.

    :param prompt: the user message text.
    :param model: the system-prompt string (one of the ``models`` values),
        NOT the model name; kept for backward compatibility.
    :return: the assistant's reply text.
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": model},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def gpt_bool(prompt):
    """Ask the binary persona a yes/no question.

    :param prompt: the text prompt
    :return: True or False

    Improvement: the original compared against a hand-enumerated casing list
    ("yes", "Yes", "Y", ...) wrapped in a redundant ``bool()``.  Normalizing
    the reply accepts every variant the old list did (plus stray whitespace
    and casings like "YES.") and still defaults to False when uncertain.
    """
    answer = gpt4(prompt, model=models["binary"])
    return answer.strip().lower() in {"y", "yes", "yes."}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def vectorize_data(data_input):
    """Embed *data_input* with text-embedding-ada-002.

    BUG FIX: the original used the pre-1.0 ``openai.Embedding`` /
    ``openai.error`` API although this module imports the v1 ``OpenAI``
    client; ported to ``client.embeddings.create``.

    :param data_input: text (or list of texts) to embed.
    :return: the embedding vector as a list of floats, or the sentinel
        ``[0, 0, 0]`` when the request is rejected (preserves the old
        best-effort behavior).
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    try:
        response = client.embeddings.create(input=data_input,
                                            model="text-embedding-ada-002")
    except openai.BadRequestError as err:
        print(err)
        return [0, 0, 0]
    return response.data[0].embedding
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Manual smoke test: reads stdin lines forever and prints the boolean reply
# for each one (Ctrl-C to stop).
if __name__ == "__main__":
    print("here are all functions that directly call openai.")
    print("hi, im chatGPT how can I help? ")
    while True:
        print(gpt_bool(input()))
|
my_1_reader.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# TODO: needs cleanup
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import subprocess
|
| 6 |
+
import PyPDF2
|
| 7 |
+
import csv
|
| 8 |
+
import fitz # PyMuPDF
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file.

    :param pdf_path: Path to the PDF file.
    :return: Extracted text as a string.
    """
    doc = fitz.open(pdf_path)
    # Pull the text of every page in order, then join into one string.
    page_texts = [doc.load_page(index).get_text() for index in range(len(doc))]
    doc.close()
    return ''.join(page_texts)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def read_pdfs_from_folder(folder_path):
    """
    Reads all PDF files in the specified folder using PdfReader and extracts their text.

    The extracted text is sanitized: umlauts are transliterated (ae/oe/ue),
    CSV-hostile characters (comma, semicolon, backslash, double quote) become
    underscores, and newlines become the literal token "<newline>".

    Parameters:
    - folder_path: The path to the folder containing PDF files.

    Returns:
    - A dictionary with file names as keys and their sanitized text as values.

    BUG FIX: the original sanitized with a per-character loop that called
    ``str.index`` repeatedly (O(n^2)) and listed the two-character string
    "/n", which a single-character loop variable can never equal.  A single
    ``str.translate`` pass applies the same mapping in one sweep.
    """
    # Same substitutions as the original loop, applied in one C-level pass.
    sanitize = str.maketrans({
        "ä": "ae", "Ä": "ae",
        "ö": "oe", "Ö": "oe",
        "ü": "ue", "Ü": "ue",
        ",": "_", ";": "_", "\\": "_", '"': "_",
        "\n": "<newline>",
    })
    pdf_texts = {}
    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            text = ''
            for page in pdf_reader.pages:
                try:
                    text += page.extract_text()
                except UnicodeDecodeError as e:
                    # best-effort: keep whatever pages decoded so far
                    print(e)
            pdf_texts[filename] = text.translate(sanitize)
    return pdf_texts
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def read_csv_lines_as_strings(filename):
    """
    Opens a CSV file and returns each line as a string in a list.

    Parameters:
    - filename: The path to the CSV file.

    Returns:
    - A list of strings, each representing a line from the CSV file
      (best-effort: lines read before a decode error are still returned).
    """
    collected = []
    with open(filename, newline='') as csvfile:
        try:
            for record in csv.reader(csvfile):
                # Re-join the parsed fields into a comma-separated string.
                collected.append(','.join(record))
        except UnicodeDecodeError as e:
            print(e)
    return collected
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# Function to load data from JSON files
|
| 99 |
+
def load_data(filename):
    """Load a JSON file; on a decode error, print it and return an empty dict."""
    with open(filename, 'r') as handle:
        try:
            return json.load(handle)
        except UnicodeDecodeError as err:
            print(err)
            return {}
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def find_and_open_file(filename, start_directory):
    """
    Attempts to find a file with the given filename starting from the specified directory.
    If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.

    :param filename: bare file name to look for.
    :param start_directory: root of the recursive search.
    :return: full path of the first match, or None if not found.

    BUG FIX: the not-found message was an f-string that had lost its
    placeholder; it now reports which file was missing.
    """
    for root, _dirs, files in os.walk(start_directory):
        if filename in files:
            filepath = os.path.join(root, filename)
            print(f"File found: {filepath}")
            return filepath
    print(f"File {filename} not found.")
    return None
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def open_file(filepath):
    """
    Opens the file with the default application, based on the operating system.

    BUG FIX: the original ran ``open`` for every POSIX system, but that
    command is macOS-only; Linux desktops use ``xdg-open``.  The opener is
    now chosen per platform.  Prints a message (and returns None) when the
    file is missing or the OS is unsupported.
    """
    if not os.path.exists(filepath):
        print(f"File does not exist: {filepath}")
        return
    if os.name == 'nt':  # Windows
        os.startfile(filepath)
    elif os.name == 'posix':  # macOS, Linux, etc.
        import sys  # local import: only needed to distinguish Darwin from Linux
        opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
        subprocess.call((opener, filepath))
    else:
        print(f"Cannot open file on this operating system: {filepath}")
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def list_folders_files_recursive(path, depth=0):
    """
    Recursively lists all folders and files within the specified path, including subfolders.

    Parameters:
    - path: The directory path to list contents from.
    - depth: The current depth of recursion (used for indentation in print statements).

    Returns:
    - None
    """
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return

    indent = ' ' * depth  # deeper levels are indented further
    plain_files = []

    # Folders are printed (and descended into) as encountered; files are
    # collected and printed afterwards, matching the original ordering.
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            print(f"{indent}Folder: {entry}")
            list_folders_files_recursive(full_path, depth + 1)
        elif os.path.isfile(full_path):
            plain_files.append(entry)

    for name in plain_files:
        print(f"{indent}File: {name}")
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def list_folders_files(path):
    """
    Lists all folders and files within the specified path.

    Parameters:
    - path: The directory path to list contents from.

    Returns:
    - A tuple of two lists: (folders, files).  Both are empty when *path*
      is not a valid directory (a message is printed in that case).
    """
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return [], []

    entries = os.listdir(path)
    folders = [e for e in entries if os.path.isdir(os.path.join(path, e))]
    files = [e for e in entries if os.path.isfile(os.path.join(path, e))]
    return folders, files
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# No script behavior beyond an informational message.
if __name__ == "__main__":
    print("here are all functions that read files")
|
my_1_writer.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# TODO: needs cleanup
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def split_json_file(input_filepath, lines_per_file=50,
                    output_template='translate_data/english_{}.json'):
    """
    Splits a JSON file into multiple files, each containing up to 'lines_per_file' lines.

    param input_filepath: The path to the input JSON file.
    param lines_per_file: The maximum number of lines per output file.
    param output_template: Format string for the output paths; '{}' is
        replaced with the 1-based chunk number.  Defaults to the original
        hard-coded 'translate_data/english_N.json' naming, so existing
        callers are unaffected.
    """
    with open(input_filepath, 'r') as input_file:
        lines = input_file.readlines()

    file_counter = 1  # 1-based chunk number used in the output names
    for i in range(0, len(lines), lines_per_file):
        output_filename = output_template.format(file_counter)
        with open(output_filename, 'w') as output_file:
            output_file.writelines(lines[i:i + lines_per_file])
        print(f'Created {output_filename}')
        file_counter += 1
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
    """
    Merges two lists and two dictionaries into a pandas DataFrame according to the structure:
    headers: ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2']
    and saves it as a CSV file.

    Row count is driven by len(list1); the other inputs must supply at least
    that many elements/items (IndexError otherwise, as before).

    Parameters:
    - list1 (list): First list to merge, contributing to column 'list1'.
    - list2 (list): Second list to merge, contributing to column 'list2'.
    - dict1 (dict): First dictionary to merge, keys and values added as separate columns.
    - dict2 (dict): Second dictionary to merge, keys and values added as separate columns.
    - filename (str): Filename for the saved CSV file.

    BUG FIX: the confirmation message was an f-string that had lost its
    placeholder; it now reports the actual file name.
    """
    dict1_items = list(dict1.items())
    dict2_items = list(dict2.items())
    data = [
        {
            'list1': list1[i],
            'list2': list2[i],
            'keys dict1': dict1_items[i][0],
            'vals dict1': dict1_items[i][1],
            'keys dict2': dict2_items[i][0],
            'vals dict2': dict2_items[i][1],
        }
        for i in range(len(list1))
    ]
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"DataFrame saved as '{filename}' in the current directory.")
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# new line for every entry
|
| 73 |
+
# new line for every entry
def safe_my_dict_as_json(file_name, my_dict):
    """Write *my_dict* as a JSON object with one key/value pair per line.

    param file_name: output path.
    param my_dict: a dict, or a single-element list containing a dict
        (the dict is then unwrapped, as before).

    BUG FIX: the original computed the item count BEFORE unwrapping a
    list-wrapped dict, so for [{...}] the comma logic used len==1 and the
    emitted file was invalid JSON (missing commas between entries).
    """
    print(my_dict)
    # Unwrap first so the comma bookkeeping below sees the real dict.
    if isinstance(my_dict, list):
        my_dict = my_dict[0]
    with open(file_name, 'w') as f:
        f.write('{\n')
        total_items = len(my_dict)
        for i, (key, value) in enumerate(my_dict.items()):
            # json.dumps handles quoting/escaping of both key and value.
            json_key = json.dumps(key)
            json_value = json.dumps(value)
            comma = ',' if i < total_items - 1 else ''
            f.write(f"  {json_key}: {json_value}{comma}\n")
        f.write('}\n')
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# No script behavior beyond an informational message.
if __name__ == "__main__":
    print("here are all functions that write to the Datasets")
|
my_2_embedder.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import my_1_reader
|
| 2 |
+
import my_1_writer
|
| 3 |
+
import my_1_openai
|
| 4 |
+
import os
|
| 5 |
+
import openai
|
| 6 |
+
import pdf2image
|
| 7 |
+
from pdf2image import convert_from_path
|
| 8 |
+
from PIL import Image
|
| 9 |
+
import csv
|
| 10 |
+
import numpy as np
|
| 11 |
+
import os
|
| 12 |
+
import pdfminer
|
| 13 |
+
from pdf2image import convert_from_path
|
| 14 |
+
import csv
|
| 15 |
+
import numpy as np
|
| 16 |
+
import os
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Assuming your my_1_openai's vectorize functions work as described
|
| 20 |
+
def vectorize_data(data):
    """Stub embedding: ignores *data* and returns a random 100-dim vector.

    Placeholder for the real text-embedding call (see my_1_openai).
    """
    placeholder = np.random.rand(100)
    return placeholder.tolist()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def vectorize_image(data):
    """Stub embedding: ignores *data* and returns a random 100-dim vector.

    Placeholder for a real image-embedding call.
    """
    placeholder = np.random.rand(100)
    return placeholder.tolist()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def vectorize_this_pdf_with_metadata(pdf_path, output_path, metadata_filename="DS_U3/U3_Metadaten.csv"):
    """Build a vector record for one PDF: metadata row + text and image vectors.

    :param pdf_path: path to the PDF to process.
    :param output_path: accepted but currently unused — nothing is written;
        the record is only printed and returned.
    :param metadata_filename: CSV whose 'Name' column is matched against the
        PDF's base name; the whole matching row becomes the metadata.
    :return: dict with keys my_id, og_name, metadata, vec_content_text,
        vec_content_img.
    """
    tensor_description = {
        "my_id": 89,  # Example ID, ideally this should be dynamically generated
        "og_name": pdf_path,
        "metadata": {},
        "vec_content_text": [],
        "vec_content_img": []
    }

    # Read metadata from CSV and match by 'og_name' (pdf_path)
    # 'latin1', 'ISO-8859-1', or 'cp1252'
    # NOTE(review): utf-8 is assumed here; the comment above suggests the CSV
    # may actually be in a Latin encoding — confirm against the data.
    with open(metadata_filename, mode='r', encoding='utf-8') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        for row in csv_reader:
            if row["Name"] == os.path.basename(pdf_path):  # Assuming 'Name' is a column in your CSV
                tensor_description['metadata'] = row
                break

    # get text content
    text = my_1_reader.extract_text_from_pdf(pdf_path)

    # Vectorize extracted text (stub vectorize_data defined above in this file)
    if text:
        tensor_description['vec_content_text'].append(vectorize_data(data=text))

    # Convert PDF pages to images using pdf2image
    images = convert_from_path(pdf_path)
    for img in images:
        # Assume vectorize_image expects a PIL image; pdf2image.convert_from_path already returns PIL images
        img_vector = vectorize_image(data=img)
        tensor_description['vec_content_img'].append(img_vector)

    # Here, instead of saving the tensor, we'll simply print it as an example
    print(tensor_description)

    return tensor_description
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def vectorize_this_pdf_with_metadata_old(pdf_path, output_path, metadata_filename="DS_U3/U3_Metadaten.csv"):
    """Earlier design sketch of the PDF-vectorization record.

    Builds only a hard-coded sample of the intended structure and always
    returns an empty tensor list — kept for reference, superseded by
    vectorize_this_pdf_with_metadata.
    """
    # Illustrative shape of the planned record; none of these sample values
    # are used anywhere.
    _sample_record = {
        "my_id": 89,  # would enumerate how often the function has been called
        "og_name": pdf_path,
        # would be the full metadata row matching this file name
        "metadata": {"a": 1, "b": 2, "c": 3},
        # would hold Ada-002 vectors for all text chunks of the PDF
        "vec_content_text": [[0.03874, 0.03947, -0.0875], [-0.03234, 0.03437, -0.011234]],
        # would hold image vectors (one per page rendered via pdf2image)
        "vec_content_img": [[0.01234, 0.09875, -0.0542], [-0.02456, 0.03537, -0.016634]]
    }

    return []  # the tensor was never actually built
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def vectorize_pdfs(pdf_dict):
    """
    Vectorize a pdf using openai API

    Parameters:
    - pdf_dict: dictionary containing PDF files (name -> extracted text).

    Returns:
    - dictionary mapping each name to the stringified vector.
    """
    # NOTE(review): openai.error exists only in openai<1.0 — confirm the
    # installed client version matches.
    vec_dataset = {}
    for name, content in pdf_dict.items():
        try:
            vector = my_1_openai.vectorize_data(content)
        except openai.error.InvalidRequestError as err:
            print(err)
            vector = [0, 0, 0]  # sentinel on rejected requests
        vec_dataset[name] = str(vector)
    return vec_dataset
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def vectorize_csv(csv_table, safe=False):
    """Vectorize every row of *csv_table* via the OpenAI embedding helper.

    :param csv_table: iterable of row strings to vectorize.
    :param safe: when True, also append each vector to a text file inside a
        per-table folder.
    :return: list of stringified vectors, one per input row.

    BUG FIX: the original reopened the output file with mode "w" on every
    iteration, so only the LAST vector survived; it is now opened in append
    mode.
    NOTE(review): the folder/file names interpolate the csv_table object
    itself — presumably a table *name* was intended; confirm with callers.
    """
    folder_name = ""
    if safe:
        folder_name = f"{csv_table}_vectorised/"
        os.makedirs(folder_name, exist_ok=True)
    vec_dataset = []
    for line_no, data_item in enumerate(csv_table, start=1):
        vector = my_1_openai.vectorize_data(data_item)
        if safe:
            with open(f"{folder_name}{csv_table}_vec.txt", "a") as f:
                f.write(str(vector) + "\n")
            print("csv_line" + str(line_no))
        vec_dataset.append(str(vector))
    return vec_dataset
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def create_df(ds):
    """Build and print an exploratory column dict for dataset folder *ds*.

    Columns: PDF file names, rows 2-11 of '<ds>_metadata.csv', and the
    extracted PDF texts; each column additionally gets a vectorized twin
    ('name_vec', 'metadata_vec', 'text_vec').  Prints the third entry of
    every column and returns None (debug/exploration helper).
    """
    # my_df = {"name": [], "metadata": [], "body_text": []}
    my_df, my_new_df = {}, {}
    # PDF file names found directly in the dataset folder
    my_df["name"] = [filename for filename in os.listdir(ds) if filename.endswith('.pdf')]
    # rows 1..10 of the metadata CSV (index 0 is presumably the header — confirm)
    my_df["metadata"] = my_1_reader.read_csv_lines_as_strings(ds + "_metadata.csv")[1:11]
    my_df["text"] = list(my_1_reader.read_pdfs_from_folder(ds).values())
    # One vector per item in every column (one API call per item)
    for e in my_df:
        my_new_df[f"{e}_vec"] = [my_1_openai.vectorize_data(item) for item in my_df[e]]
    # Merge the *_vec columns back into the main dict
    for e in my_new_df:
        my_df[str(e)] = my_new_df[e]
    # Debug output: third entry of every column (IndexError if fewer than 3 items)
    for e in my_df:
        print(f"{e} {my_df[e][2]}")
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def create_vec_dataset(folder):
    """Vectorize every PDF under <folder>/PDF and persist names and texts.

    Writes two JSON vector tables into <folder>/vectors/.
    """
    pdfs = my_1_reader.read_pdfs_from_folder(f"{folder}/PDF")
    vectorize_then_safe_data(f"{folder}/vectors//names.json", pdfs.keys())
    vectorize_then_safe_data(f"{folder}/vectors//texts.json", pdfs.values())
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# function to vectorize data=[]. then safes as json.
|
| 147 |
+
# function to vectorize data=[]. then safes as json.
def vectorize_then_safe_data(file_name, data):
    """Vectorize each entry of *data* and save {entry: vector} as JSON.

    :param file_name: output JSON path.
    :param data: iterable of strings to vectorize (iterated twice).
    """
    vectors = [my_1_openai.vectorize_data(entry) for entry in data]
    my_1_writer.safe_my_dict_as_json(file_name, dict(zip(data, vectors)))
    print("vectorised data saved")
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def main():
    """Example driver: vectorize one known PDF with its metadata row."""
    vectorize_this_pdf_with_metadata(
        'DS_U3/Dokumente/E - Elektroanlagen/ISB-020-U3-W-E-01-B07005-001-040.pdf',
        'DS_U3/Dokumente_vec',
        'DS_U3/U3_Metadaten.csv',
    )
|
| 162 |
+
|
| 163 |
+
if __name__ == "__main__":
    print("this file contains embedding functions")
    # NOTE(review): this calls the local random-stub vectorize_data, so the
    # two vectors will (almost surely) differ and "same" will not print.
    vec1 = vectorize_data("this is the test string")
    vec2 = vectorize_data("this is the test string")
    if vec1 == vec2:
        print("same")
|
| 169 |
+
|
my_2_sim_search.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import my_1_openai
|
| 2 |
+
import my_1_writer
|
| 3 |
+
import json
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# sim search with dot_product and lin_distance
|
| 8 |
+
# the newly vectorized TERM will be added to the database
|
| 9 |
+
# sim search with dot_product and lin_distance
# the newly vectorized TERM will be added to the database
def sim_search(database, term, add_to_db=True, debug=False):
    """Similarity-search *term* against a JSON vector table.

    :param database: path to a JSON file mapping key -> vector.
    :param term: either a string (vectorized via my_1_openai) or a ready
        vector (list).  Anything else prints a message and returns None.
    :param add_to_db: when True, store the new term's vector in the table.
    :param debug: print per-key diagnostics and the top-5 results.
    :return: dict of key -> score, sorted by descending score.

    BUG FIXES vs. the original:
    - ``dict(sorted(...), reversed=True)`` inserted a literal
      ``"reversed": True`` entry into the result instead of reversing the
      sort; the descending order now lives inside ``sorted()``.
    - ``sorted_table[:5]`` sliced a dict (TypeError whenever debug=True).
    - ``term in table.keys()`` raised TypeError for list terms (unhashable);
      membership is now checked on ``str(term)``, matching how terms are
      stored.
    """
    if isinstance(term, str):
        print("str")
        vector1 = my_1_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return

    with open(database, "r") as f:
        table = json.load(f)

    sim_search_dict = {}
    for key, vector2 in table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        # NOTE(review): score = dot * distance is an unusual similarity
        # measure (identical vectors score 0) — confirm this is intended.
        sim_search_dict[key] = dp * distance

    # sort with the biggest similarity first
    sorted_table = dict(sorted(sim_search_dict.items(),
                               key=lambda kv: kv[1], reverse=True))

    if debug:
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")

    if add_to_db:
        if str(term) in table:
            print("the search term is in the database!")
        # add the newly vectorized term to the words, if not already in the vector table
        else:
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            table[str(term)] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)

    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """Dot every vector in *database* against a reference vector and save the result.

    :param database: path to a JSON file mapping key -> vector.
    :param vector1: 0 -> constant default reference (1536-dim, ada-002 size);
        1 -> first vector in the table; otherwise used as given.
    :param analysis_filename: where the {key: dot product} dict is saved as JSON.
    """
    with open(database, "r") as fh:
        table = json.load(fh)

    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    dot_product_to1 = {key: np.dot(vector1, vec) for key, vec in table.items()}
    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
    print("dot p to 1 saved")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def lin_dist(database, vector1=0, analysis_filename=0):
    """Euclidean distance from every vector in *database* to a reference vector.

    :param database: path to a JSON file mapping key -> vector.
    :param vector1: 0 -> constant default reference (1536-dim, ada-002 size);
        1 -> first vector in the table; otherwise used as given.
    :param analysis_filename: where the {key: distance} dict is saved as JSON.
    """
    with open(database, "r") as fh:
        table = json.load(fh)

    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    reference = np.array(vector1)
    lin_dist_to_1 = {key: np.linalg.norm(reference - np.array(vec))
                     for key, vec in table.items()}

    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
    print("lin dist to 1 saved")
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def manhattan_dist(database, vector1=0, analysis_filename=0):
    """Manhattan (L1) distance from every vector in *database* to a reference vector.

    :param database: path to a JSON file mapping key -> vector.
    :param vector1: 0 -> constant default reference (1536-dim, ada-002 size);
        1 -> first vector in the table; otherwise used as given.
    :param analysis_filename: where the {key: distance} dict is saved as JSON.

    BUG FIX: the original summed SIGNED differences, letting positive and
    negative components cancel (e.g. distance 0 between different vectors).
    The L1 distance is the sum of absolute differences.
    """
    with open(database, "r") as f:
        table = json.load(f)
    manhattan_dist_to_1 = {}

    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    for key in table.keys():
        manhattan_dist_to_1[key] = np.sum(np.abs(np.array(vector1) - np.array(table[key])))

    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")
|