Spaces:
Sleeping
Sleeping
| import csv | |
| import tiktoken | |
# Load every data row of sample.csv as a two-element list:
#   [question text, ten passages joined by single spaces]
questions_and_passages = []

# newline="" is what the csv module requires so that newlines embedded in
# quoted fields are parsed correctly.
# NOTE(review): encoding="utf-8" assumes sample.csv is UTF-8 -- confirm.
with open("sample.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)

    # Skip the first record (presumably the header row) through the reader,
    # not the file object, so a first record containing a quoted newline is
    # skipped as a whole. The None default avoids StopIteration on an empty
    # file.
    next(reader, None)

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; they carry no data.
            continue

        # Columns 1 and 2: optional prompt + question.
        question = (row[1] + " " + row[2]).strip()

        # Columns 9..18: the ten passages, separated by a single space.
        # Indexing each column (rather than slicing) keeps the IndexError
        # for rows that are too short instead of silently truncating.
        passages = " ".join(row[i] for i in range(9, 19))

        questions_and_passages.append([question, passages])
# Report gpt-4o token statistics for the loaded question/passage entries.

# Both averages below divide by the number of entries, so fail with a clear
# message instead of a ZeroDivisionError when nothing was loaded.
if not questions_and_passages:
    raise SystemExit("No question/passage entries loaded; nothing to measure.")

enc = tiktoken.encoding_for_model("gpt-4o")

# One (question tokens, passage tokens) pair per entry; each text is encoded
# exactly once. disallowed_special=() makes text that happens to look like a
# special token (e.g. "<|endoftext|>") get tokenized as ordinary text rather
# than raising ValueError.
token_counts = [
    (
        len(enc.encode(question, disallowed_special=())),
        len(enc.encode(passages, disallowed_special=())),
    )
    for question, passages in questions_and_passages
]
entry_count = len(token_counts)

question_tokens = sum(qt for qt, _ in token_counts)
question_passage_tokens = sum(qt + pt for qt, pt in token_counts)
max_qt = max(qt for qt, _ in token_counts)
max_pt = max(pt for _, pt in token_counts)
max_qpt = max(qt + pt for qt, pt in token_counts)

print(f"Average question length, gpt-4o tokens: {question_tokens / entry_count}")
print(f"Longest question (tokens): {max_qt}")
print(
    "Average question + 10 passages length, gpt-4o tokens: "
    f"{question_passage_tokens / entry_count}"
)
print(f"Longest set of ten passages (tokens): {max_pt}")
print(f"Longest combination of question and passages: {max_qpt}")