autoahxan committed
Commit d51140f · verified · 1 parent: dc3056f

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+.gradio/flagged/Upload[[:space:]]Excel[[:space:]]File/2aeafc769523b0a05452/Quick_and_rough_list_for_Ahsan_initial_trials[[:space:]]-[[:space:]]10th[[:space:]]Dec[[:space:]]2024.xlsx filter=lfs diff=lfs merge=lfs -text
+.gradio/flagged/Upload[[:space:]]Excel[[:space:]]File/6c7338224e14819dfdc1/Quick_and_rough_list_for_Ahsan_initial_trials[[:space:]]-[[:space:]]10th[[:space:]]Dec[[:space:]]2024.xlsx filter=lfs diff=lfs merge=lfs -text
+.gradio/flagged/Upload[[:space:]]Excel[[:space:]]File/fc94c399c41af885dce4/Quick_and_rough_list_for_Ahsan_initial_trials[[:space:]]-[[:space:]]10th[[:space:]]Dec[[:space:]]2024.xlsx filter=lfs diff=lfs merge=lfs -text
+data/Quick_and_rough_list_for_Ahsan_initial_trials[[:space:]]-[[:space:]]10th[[:space:]]Dec[[:space:]]2024.xlsx filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+.env
+__pycache__
+llama3.2-vision.ipynb
+.DS_Store
+data/*
+__pycache__
+download_app.py
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
.gradio/flagged/Download Processed Excel/9824a6c316a560c7dada/20250117_103737_processed_output.xlsx ADDED
Binary file (7.25 kB)
.gradio/flagged/Upload Excel File/2aeafc769523b0a05452/Quick_and_rough_list_for_Ahsan_initial_trials - 10th Dec 2024.xlsx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:526098f685db9b7e7223b763e0efa72069f70cff7b9f0a0bb9f334a7fb91fa4e
+size 139903
.gradio/flagged/Upload Excel File/6c7338224e14819dfdc1/Quick_and_rough_list_for_Ahsan_initial_trials - 10th Dec 2024.xlsx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0e322cb97a467369a9f86558bf27d9d1db18e48e869c6499af8c8bc76124ea7
+size 122490
.gradio/flagged/Upload Excel File/fc94c399c41af885dce4/Quick_and_rough_list_for_Ahsan_initial_trials - 10th Dec 2024.xlsx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:526098f685db9b7e7223b763e0efa72069f70cff7b9f0a0bb9f334a7fb91fa4e
+size 139903
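Note: the flagged uploads above are committed as Git LFS pointer files, not as the spreadsheets themselves. A minimal sketch (a hypothetical helper, not part of this commit) of reading that three-line version/oid/size format:

# Parse a Git LFS pointer: one "key value" pair per line, oid prefixed with "sha256:".
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return {
        "version": fields.get("version"),                          # LFS spec URL
        "sha256": fields.get("oid", "").removeprefix("sha256:"),   # hash of the real file
        "size_bytes": int(fields.get("size", "0")),                # size of the real file
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:526098f685db9b7e7223b763e0efa72069f70cff7b9f0a0bb9f334a7fb91fa4e
size 139903"""
print(parse_lfs_pointer(pointer))  # {'version': ..., 'sha256': '5260...', 'size_bytes': 139903}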
.gradio/flagged/dataset1.csv ADDED
@@ -0,0 +1,60 @@
+Upload Excel File,Enter Prompt,Evaluation Metrics,Download Processed Excel,timestamp
+.gradio/flagged/Upload Excel File/2aeafc769523b0a05452/Quick_and_rough_list_for_Ahsan_initial_trials - 10th Dec 2024.xlsx,"You are a teacher of philosophy of science.
+I want you to assess whether the use of the expressions ""to prove"", ""proving"", ""proof"", ""proved"", in the provided passages,
+is appropriate, accurate, and logically consistent from the point of view of the rigorous mathematical meaning of the concept of 'proof'.
+
+Consider statements of the type ""X cannot be proved"" or ""there is no proof for X"" as appropriate even when they are applied to something
+for which the concept of 'proof' is not appropriate. Consider questions involving the concept of 'proof' as appropriate even when they are
+applied to something for which the concept of 'proof' is not appropriate. You need to respond with a ""+1"" if the use of those
+expressions is appropriate, accurate, and logically consistent and with a ""-1"" otherwise. Proving something wrong is also acceptable.","Error: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].",,2025-01-17 07:49:32.632581
+.gradio/flagged/Upload Excel File/fc94c399c41af885dce4/Quick_and_rough_list_for_Ahsan_initial_trials - 10th Dec 2024.xlsx,"You are a teacher of philosophy of science.
+I want you to assess whether the use of the expressions 'to prove', 'proving', 'proof', 'proved', in the provided passages,
+is appropriate, accurate, and logically consistent from the point of view of the rigorous mathematical meaning of the concept of 'proof'.
+
+Consider statements of the type 'X cannot be proved' or 'there is no proof for X' as appropriate even when they are applied to something
+for which the concept of 'proof' is not appropriate. Consider questions involving the concept of 'proof' as appropriate even when they are
+applied to something for which the concept of 'proof' is not appropriate. You need to respond with a '+1' if the use of those
+expressions is appropriate, accurate, and logically consistent and with a '-1' otherwise. Proving something wrong is also acceptable.","'
+<div style=""font-family: Arial, sans-serif;"">
+<h3 style=""color: #2b7a78;"">Evaluation Metrics</h3>
+<p><strong style=""color: #d9534f;"">Accuracy:</strong>
+<span style=""font-size: 1.2em; font-weight: bold; color: #5cb85c;"">0.33</span></p>
+<p><strong>Precision:</strong> N/A</p>
+<p><strong>Recall:</strong> N/A</p>
+<p><strong>F1 Score:</strong> N/A</p>
+<p><strong>Confusion Matrix:</strong><br><table class=""dataframe table table-bordered table-striped"">
+<thead>
+<tr style=""text-align: right;"">
+<th></th>
+<th>Predicted -1.0</th>
+<th>Predicted 0.0</th>
+<th>Predicted 1.0</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<th>Class -1.0</th>
+<td>3</td>
+<td>0</td>
+<td>0</td>
+</tr>
+<tr>
+<th>Class 0.0</th>
+<td>2</td>
+<td>0</td>
+<td>0</td>
+</tr>
+<tr>
+<th>Class 1.0</th>
+<td>4</td>
+<td>0</td>
+<td>0</td>
+</tr>
+</tbody>
+</table></p>
+</div>
+",.gradio/flagged/Download Processed Excel/9824a6c316a560c7dada/20250117_103737_processed_output.xlsx,2025-01-17 10:41:32.033493
+.gradio/flagged/Upload Excel File/6c7338224e14819dfdc1/Quick_and_rough_list_for_Ahsan_initial_trials - 10th Dec 2024.xlsx,"You are a teacher of philosophy of science.
+I want you to assess whether the use of the expressions 'to prove', 'proving', 'proof', 'proved', in the provided passages, is appropriate, accurate, and logically consistent from the point of view of the rigorous mathematical meaning of the concept of 'proof'.
+
+Consider statements of the type 'X cannot be proved' or 'there is no proof for X' as appropriate even when they are applied to something for which the concept of 'proof' is not appropriate. Consider questions involving the concept of 'proof' as appropriate even when they are applied to something for which the concept of 'proof' is not appropriate. You need to respond with a '+1' if the use of those expressions is appropriate, accurate, and logically consistent and with a '-1' otherwise. Proving something wrong is also acceptable.",,,2025-01-18 10:29:24.783349
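Note: the error logged in the first flagged row comes from calling sklearn's precision/recall/F1 with average='binary' on labels drawn from {-1, 0, 1}, which sklearn treats as multiclass. A hedged sketch (illustrative data, not from the logged run) of a multiclass-safe call:

# With three classes, 'binary' averaging is rejected; 'macro' (or 'micro'/'weighted') works.
from sklearn.metrics import precision_score, recall_score, f1_score

y_true = [-1, -1, 0, 1, 1]     # illustrative labels only
y_pred = [-1, -1, -1, -1, 1]

print(precision_score(y_true, y_pred, average="macro", zero_division=0))
print(recall_score(y_true, y_pred, average="macro", zero_division=0))
print(f1_score(y_true, y_pred, average="macro", zero_division=0))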
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Kwik Ai
-emoji: 💻
-colorFrom: red
-colorTo: green
-sdk: gradio
-sdk_version: 5.29.0
+title: kwik-ai
 app_file: app.py
-pinned: false
+sdk: gradio
+sdk_version: 5.12.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,279 @@
+from dotenv import load_dotenv
+load_dotenv()
+import os
+import json
+import glob
+import gradio as gr
+import pandas as pd
+from datetime import datetime
+from functions import evaluate_dataframe, run_openai_inference
+import openpyxl
+from openpyxl.styles import Alignment
+import traceback
+
+# Ensure the output directory exists
+OUTPUT_DIR = "./data/outputs"
+if not os.path.exists(OUTPUT_DIR):
+    os.makedirs(OUTPUT_DIR)
+
+# Functions for Analysis Interface
+def process_dataframe(df, prompt: str, model: str, max_iterations: int = 5):
+    print("Starting process_dataframe function...")
+    df['passage'] = (
+        df['LeftContext'].astype(str) +
+        " <expression>" + df['Keyword'].astype(str) + "</expression> " +
+        df['RightContext'].astype(str)
+    )
+    tasks = []
+    indices = []
+    print("Iterating over rows in random order...")
+    for idx, row in df.sample(frac=1.0).iterrows():
+        if len(tasks) >= max_iterations:
+            print(f"Max iterations reached: {max_iterations}")
+            break
+        if pd.isna(row['Category']):
+            print(f"Skipping row {idx} due to missing 'Category'")
+            continue
+        indices.append(idx)
+        print(f"Scheduling task for row {idx} with passage: {row['passage']}")
+        tasks.append(run_openai_inference(prompt, row['passage'], model))
+    print(f"Running inference for {len(tasks)} tasks...")
+    results = [task for task in tasks]
+    print("Assigning results to corresponding rows...")
+    for i, idx in enumerate(indices):
+        score, reason = results[i]
+        print(f"Row {idx}: Assigned score: {score}, reason: {reason}")
+        df.at[idx, 'Prediction'] = score
+        df.at[idx, 'Prediction Reason'] = reason
+    print("Dropping rows with missing predictions...")
+    df_out = df.dropna(subset=['Prediction'])
+    print("Finished processing dataframe.")
+    return df_out
+
+def process_excel(file, prompt, model, slider_value):
+    try:
+        print("Reading Excel file...")
+        df = pd.read_excel(file.name)
+        print(f"Excel file read successfully. DataFrame shape: {df.shape}")
+        print("Processing DataFrame...")
+        df_out = process_dataframe(df, prompt, model, max_iterations=slider_value)
+        print(f"DataFrame processed. Output shape: {df_out.shape}")
+        print("Evaluating DataFrame...")
+        evaluation_results = evaluate_dataframe(df_out['Category'], df_out['Prediction'])
+        print(f"Evaluation results: {evaluation_results}")
+        print("Generating file paths and timestamps...")
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        output_file = os.path.join(OUTPUT_DIR, f"{timestamp}_processed_output.xlsx")
+        metadata_file = os.path.join(OUTPUT_DIR, f"{timestamp}_metadata.json")
+        print(f"Output file path: {output_file}")
+        print(f"Metadata file path: {metadata_file}")
+        print("Writing Excel file with multiple sheets...")
+        with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
+            df.to_excel(writer, sheet_name='Data', index=False)
+            inputs_df = pd.DataFrame({
+                'Parameter': ['Prompt', 'Model', 'Samples Processed'],
+                'Value': [prompt, model, slider_value]
+            })
+            inputs_df.to_excel(writer, sheet_name='Inputs', index=False)
+            outputs_df = pd.DataFrame({
+                'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Samples Classified'],
+                'Value': [
+                    evaluation_results.get('accuracy'),
+                    evaluation_results.get('precision'),
+                    evaluation_results.get('recall'),
+                    evaluation_results.get('f1'),
+                    df_out.shape[0]
+                ]
+            })
+            outputs_df.to_excel(writer, sheet_name='Outputs', index=False)
+        print(f"Excel file written to: {output_file}")
+        print("Applying formatting to Excel file...")
+        wb = openpyxl.load_workbook(output_file)
+        for sheet in wb.worksheets:
+            for row in sheet.iter_rows():
+                for cell in row:
+                    cell.alignment = Alignment(wrap_text=True)
+            for row_idx in range(1, sheet.max_row + 1):
+                sheet.row_dimensions[row_idx].height = 75
+        wb.save(output_file)
+        print(f"Formatting applied and file saved: {output_file}")
+        print("Preparing evaluation results for display...")
+        accuracy = f"{evaluation_results.get('accuracy'):.2f}" if evaluation_results.get('accuracy') is not None else "N/A"
+        precision = f"{evaluation_results.get('precision'):.2f}" if evaluation_results.get('precision') is not None else "N/A"
+        recall = f"{evaluation_results.get('recall'):.2f}" if evaluation_results.get('recall') is not None else "N/A"
+        f1_score = f"{evaluation_results.get('f1'):.2f}" if evaluation_results.get('f1') is not None else "N/A"
+        conf_matrix = evaluation_results.get('conf_matrix', 'N/A')
+        eval_display = f"""
+        <div style="font-family: Arial, sans-serif;">
+            <h3 style="color: #2b7a78;">Evaluation Metrics</h3>
+            <p><strong style="color: #d9534f;">Accuracy:</strong>
+            <span style="font-size: 1.2em; font-weight: bold; color: #5cb85c;">{accuracy}</span></p>
+            <p><strong>Precision:</strong> {precision}</p>
+            <p><strong>Recall:</strong> {recall}</p>
+            <p><strong>F1 Score:</strong> {f1_score}</p>
+            <p><strong>Confusion Matrix:</strong><br>{conf_matrix}</p>
+        </div>
+        """
+        print("Evaluation results prepared for display.")
+        print("Saving run metadata...")
+        history_data = {
+            'date_of_run': timestamp,
+            'input_filename': os.path.basename(file.name),
+            'output_filename': os.path.basename(output_file),
+            'prompt': prompt,
+            'model': model,
+            'samples_requested': slider_value,
+            'samples_classified': df_out.shape[0],
+            'accuracy': evaluation_results.get('accuracy')
+        }
+        with open(metadata_file, "w") as f:
+            json.dump(history_data, f)
+        print(f"Run metadata saved to: {metadata_file}")
+        return eval_display, output_file
+    except Exception as e:
+        error_message = f"<div style='color: red;'>Error: {str(e)}</div>"
+        traceback.print_exc()
+        return error_message, None
+
+def load_history():
+    history_entries = []
+    for meta_file in sorted(glob.glob(os.path.join(OUTPUT_DIR, "*_metadata.json")), reverse=True):
+        try:
+            with open(meta_file, "r") as f:
+                data = json.load(f)
+            excel_filename = data.get('output_filename', '')
+            download_link = f"<a href='{os.path.join(OUTPUT_DIR, excel_filename)}' download>{excel_filename}</a>"
+            history_entries.append({
+                "Date": data.get("date_of_run", ""),
+                "Input Filename": data.get("input_filename", ""),
+                "Output Filename": excel_filename,
+                "Prompt": data.get("prompt", ""),
+                "Model": data.get("model", ""),
+                "Samples Requested": data.get("samples_requested", ""),
+                "Samples Classified": data.get("samples_classified", ""),
+                "Accuracy": f"{data.get('accuracy'):.2f}" if data.get('accuracy') is not None else "N/A",
+                "Download": download_link
+            })
+        except Exception as ex:
+            print(f"Error loading history from {meta_file}: {ex}")
+    if history_entries:
+        table_html = """
+        <style>
+        .history-table {
+            border-collapse: collapse;
+            width: 100%;
+            font-family: Arial, sans-serif;
+        }
+        .history-table th, .history-table td {
+            padding: 8px;
+            text-align: left;
+            border: 1px solid #ddd;
+        }
+        .history-table th.prompt-col, .history-table td.prompt-col {
+            min-width: 300px;
+        }
+        .history-table td span {
+            cursor: help;
+        }
+        </style>
+        <table class='history-table'>
+        <tr>
+        """
+        for col in history_entries[0].keys():
+            if col == "Prompt":
+                table_html += f"<th class='prompt-col'>{col}</th>"
+            else:
+                table_html += f"<th>{col}</th>"
+        table_html += "</tr>"
+        for entry in history_entries:
+            table_html += "<tr>"
+            for key, value in entry.items():
+                if key == "Prompt":
+                    full_prompt = value
+                    clipped = full_prompt if len(full_prompt) <= 100 else full_prompt[:100] + "..."
+                    table_html += f"<td class='prompt-col'><span title='{full_prompt}'>{clipped}</span></td>"
+                else:
+                    table_html += f"<td>{value}</td>"
+            table_html += "</tr>"
+        table_html += "</table>"
+    else:
+        table_html = "<p>No history available.</p>"
+    return table_html
+
+# Functions for Other Interface
+def classify_text(text, model):
+    """Placeholder function for text classification."""
+    return f"Classified as: {model} - {text}"
+
+# Interface Creation Functions
+def create_analysis_interface():
+    default_prompt = (
+        "You are a teacher of philosophy of science. \n"
+        "I want you to assess whether the use of the expressions 'to prove', 'proving', 'proof', 'proved', in the provided passages, \n"
+        "is appropriate, accurate, and logically consistent from the point of view of the rigorous mathematical meaning of the concept of 'proof'. \n\n"
+        "Consider statements of the type 'X cannot be proved' or 'there is no proof for X' as appropriate even when they are applied to something \n"
+        "for which the concept of 'proof' is not appropriate. Consider questions involving the concept of 'proof' as appropriate even when they are \n"
+        "applied to something for which the concept of 'proof' is not appropriate. You need to respond with a '+1' if the use of those \n"
+        "expressions is appropriate, accurate, and logically consistent and with a '-1' otherwise. Proving something wrong is also acceptable."
+    )
+    with gr.Blocks() as demo:
+        gr.Markdown("# AI Text Analysis with History")
+        with gr.Row():
+            file_input = gr.File(label="Upload Excel File")
+            prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Type something here...", value=default_prompt, lines=10)
+        with gr.Row():
+            model_input = gr.Dropdown(label="Select Model", choices=["gpt-4o", "gpt-4o-mini"], value="gpt-4o-mini")
+            slider_input = gr.Slider(label="Number of Rows to Process", minimum=5, maximum=100, step=5, value=5)
+        submit_btn = gr.Button("Process File")
+        eval_output = gr.HTML(label="Evaluation Metrics")
+        file_output = gr.File(label="Download Processed Excel")
+        submit_btn.click(fn=process_excel, inputs=[file_input, prompt_input, model_input, slider_input], outputs=[eval_output, file_output])
+        gr.Markdown("## History")
+        refresh_btn = gr.Button("Refresh History")
+        history_output = gr.HTML(label="History Table")
+        refresh_btn.click(fn=load_history, inputs=[], outputs=history_output)
+    return demo
+
+def create_inference_interface():
+    default_prompt = (
+        "You are a teacher of philosophy of science. \n"
+        "I want you to assess whether the use of the expressions 'to prove', 'proving', 'proof', 'proved', in the provided passages, \n"
+        "is appropriate, accurate, and logically consistent from the point of view of the rigorous mathematical meaning of the concept of 'proof'. \n\n"
+        "Consider statements of the type 'X cannot be proved' or 'there is no proof for X' as appropriate even when they are applied to something \n"
+        "for which the concept of 'proof' is not appropriate. Consider questions involving the concept of 'proof' as appropriate even when they are \n"
+        "applied to something for which the concept of 'proof' is not appropriate. You need to respond with a '+1' if the use of those \n"
+        "expressions is appropriate, accurate, and logically consistent and with a '-1' otherwise. Proving something wrong is also acceptable."
+    )
+    with gr.Blocks() as demo:
+        gr.Markdown("# AI Text Inference Interface")
+        with gr.Row():
+            file_input = gr.File(label="Upload Excel File")
+            prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Type something here...", value=default_prompt, lines=10)
+        with gr.Row():
+            model_input = gr.Dropdown(label="Select Model", choices=["gpt-4o", "gpt-4o-mini"], value="gpt-4o-mini")
+            slider_input = gr.Slider(label="Number of Rows to Process", minimum=5, maximum=100, step=5, value=5)
+        submit_btn = gr.Button("Process File")
+        eval_output = gr.HTML(label="Evaluation Metrics")
+        file_output = gr.File(label="Download Processed Excel")
+        submit_btn.click(fn=process_excel, inputs=[file_input, prompt_input, model_input, slider_input], outputs=[eval_output, file_output])
+        gr.Markdown("## History")
+        refresh_btn = gr.Button("Refresh History")
+        history_output = gr.HTML(label="History Table")
+        refresh_btn.click(fn=load_history, inputs=[], outputs=history_output)
+    return demo
+
+# Main Execution
+if __name__ == "__main__":
+    interface_type = os.getenv("INTERFACE_TYPE")  # must be "analysis" or "inference"; anything else raises below
+    if interface_type == "analysis":
+        static_paths = [OUTPUT_DIR]
+        gr.set_static_paths(paths=static_paths)
+        print(f"static files will be served from folder: {static_paths}")
+        demo = create_analysis_interface()
+    elif interface_type == "inference":
+        demo = create_inference_interface()
+    else:
+        raise ValueError(f"Invalid INTERFACE_TYPE: {interface_type}")
+    demo.launch(share=True)
+
+# done merging. let's see
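Note: a minimal usage sketch (not part of the commit). Running `python app.py` requires INTERFACE_TYPE to be set; importing the builder directly sidesteps that check. Assumes OPENAI_API_KEY is supplied via .env or the environment:

# Launch the analysis UI locally without the INTERFACE_TYPE env var.
from app import create_analysis_interface

demo = create_analysis_interface()
demo.launch()  # app.py itself launches with share=True; omit it for a local-only server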
functions.py ADDED
@@ -0,0 +1,183 @@
+from dotenv import load_dotenv
+load_dotenv()
+
+import os
+import json
+import pandas as pd
+from datetime import datetime
+import openpyxl
+from openpyxl.utils import get_column_letter
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
+
+# Set up global paths and history file
+OUTPUT_DIR = "./data/outputs"
+HISTORY_FILE = "./data/history.json"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+if not os.path.exists("./data"):
+    os.makedirs("./data", exist_ok=True)
+
+# Load history if it exists, otherwise initialize an empty dictionary
+if os.path.exists(HISTORY_FILE):
+    with open(HISTORY_FILE, "r") as f:
+        history = json.load(f)
+else:
+    history = {}
+
+# Import the OpenAI library.
+import openai
+client = openai
+
+def run_openai_inference(prompt: str, passage: str, model: str):
+    passage_prompt = f"""
+    Here is the passage you need to analyze:
+    <passage>
+    {passage}
+    </passage>
+    """
+
+
+    system_prompt = f"{prompt}\n\n{passage_prompt}"
+
+    # print(f"passage_prompt: {system_prompt}", end="\n\n")
+
+    format = """
+    Based on the identified type, extract and return the following data:
+    - score
+    **Output format:**
+    { "score": "return numeric score here", "reason": "return a short one liner reason for your score here" }
+    """
+
+    # print(f"system prompt is: {system_prompt}")
+
+    completion = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": format},
+        ]
+    )
+
+    # print(completion.choices[0].message.content)
+    try:
+        score = int(eval(completion.choices[0].message.content)['score'])
+        reason = eval(completion.choices[0].message.content)['reason']
+    except:
+        score = None
+        reason = None
+
+    return score, reason
+
+def process_dataframe(df, prompt: str, model: str, max_iterations: int = 5):
+    print("Starting process_dataframe function...")
+
+    # Create a new column 'passage' by combining existing columns.
+    print("Creating 'passage' column...")
+    df['passage'] = (
+        df['LeftContext'].astype(str) +
+        " <expression>" + df['Keyword'].astype(str) + "</expression> " +
+        df['RightContext'].astype(str)
+    )
+
+    tasks = []
+    indices = []  # store row indices corresponding to scheduled tasks
+    print("Iterating over rows in random order...")
+
+    # Iterate over rows in random order
+    for idx, row in df.sample(frac=1.0).iterrows():
+        if len(tasks) >= max_iterations:
+            print(f"Max iterations reached: {max_iterations}")
+            break
+        if pd.isna(row['Category']):
+            print(f"Skipping row {idx} due to missing 'Category'")
+            continue
+        indices.append(idx)
+        print(f"Scheduling task for row {idx} with passage: {row['passage']}")
+        tasks.append(run_openai_inference(prompt, row['passage'], model))
+
+    # Collect the results (the calls above are synchronous, so each entry already holds one)
+    print(f"Running inference for {len(tasks)} tasks...")
+    results = [task for task in tasks]  # could be replaced with asyncio.gather to run tasks concurrently
+
+    # Assign the results to the corresponding rows in the DataFrame
+    print("Assigning results to corresponding rows...")
+    for i, idx in enumerate(indices):
+        score, reason = results[i]
+        print(f"Row {idx}: Assigned score: {score}, reason: {reason}")
+        df.at[idx, 'Prediction'] = score
+        df.at[idx, 'Prediction Reason'] = reason
+
+    # Remove rows with missing predictions
+    print("Dropping rows with missing predictions...")
+    df_out = df.dropna(subset=['Prediction'])
+
+    print("Finished processing dataframe.")
+    return df_out
+
+
+def evaluate_dataframe(y_true, y_pred):
+    try:
+        accuracy = accuracy_score(y_true, y_pred)
+    except:
+        accuracy = None
+    try:
+        precision = precision_score(y_true, y_pred, average='binary')
+    except:
+        precision = None
+    try:
+        recall = recall_score(y_true, y_pred, average='binary')
+    except:
+        recall = None
+    try:
+        f1 = f1_score(y_true, y_pred, average='binary')
+    except:
+        f1 = None
+    try:
+        cm = confusion_matrix(y_true, y_pred)
+        conf_matrix = cm.tolist()  # convert to list for easier JSON serialization
+    except:
+        conf_matrix = None
+    return {
+        "accuracy": accuracy,
+        "precision": precision,
+        "recall": recall,
+        "f1": f1,
+        "conf_matrix": conf_matrix
+    }
+
+def save_results(df_out, prompt, model):
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_file = os.path.join(OUTPUT_DIR, f"{timestamp}_processed.xlsx")
+    # Create a DataFrame for the prompt/model info
+    df_prompt = pd.DataFrame({"Prompt": [prompt], "Model": [model]})
+    # Write the outputs and inputs to separate sheets using Pandas ExcelWriter
+    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
+        df_out.to_excel(writer, sheet_name="Outputs", index=False)
+        df_prompt.to_excel(writer, sheet_name="Inputs", index=False)
+        # No need to call writer.save() here; the context manager handles it.
+
+    # Adjust column widths in both sheets using openpyxl
+    wb = openpyxl.load_workbook(output_file)
+    for sheet_name, df in [("Outputs", df_out), ("Inputs", df_prompt)]:
+        ws = wb[sheet_name]
+        for idx, col in enumerate(df.columns, 1):
+            max_length = max((len(str(cell)) for cell in df[col].values), default=0)
+            max_length = max(max_length, len(col)) + 2
+            col_letter = get_column_letter(idx)
+            ws.column_dimensions[col_letter].width = max_length
+    wb.save(output_file)
+
+    # Update history with run details
+    history[timestamp] = {
+        "file": output_file,
+        "prompt": prompt,
+        "model": model,
+        "score": df_out['Prediction'].mean() if not df_out['Prediction'].empty else None,
+        "samples": len(df_out)
+    }
+    with open(HISTORY_FILE, "w") as f:
+        json.dump(history, f, indent=4)
+    return output_file
+
+def list_previous_files():
+    # Return the history of processed files as a dictionary
+    return history
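Note: the comment on the `results = [task for task in tasks]` line mentions asyncio.gather as a possible concurrency upgrade. A hedged sketch of that idea; `run_openai_inference_async` and `run_batch` are hypothetical names, assuming openai>=1.0, whose AsyncOpenAI client mirrors the synchronous API used above:

# Run several chat-completion requests concurrently with asyncio.gather.
import asyncio
from openai import AsyncOpenAI

async_client = AsyncOpenAI()

async def run_openai_inference_async(system_prompt: str, user_prompt: str, model: str):
    # Same request shape as run_openai_inference, awaited instead of blocking.
    completion = await async_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return completion.choices[0].message.content

async def run_batch(system_prompt: str, user_prompts: list[str], model: str):
    tasks = [run_openai_inference_async(system_prompt, p, model) for p in user_prompts]
    return await asyncio.gather(*tasks)  # requests are in flight at the same time

# results = asyncio.run(run_batch(prompt, passages, "gpt-4o-mini"))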
requirements.txt ADDED
@@ -0,0 +1,5 @@
+python-dotenv==1.0.1
+pandas==2.2.3
+scikit-learn==1.6.1
+openai==1.59.7
+openpyxl==3.1.5
testing.ipynb ADDED
@@ -0,0 +1,319 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "filename = \"./data/Quick_and_rough_list_for_Ahsan_initial_trials - 10th Dec 2024.xlsx\"\n",
+    "df = pd.read_excel(filename)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "-1"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['OPENAI_API_KEY'] = '<REDACTED: a live sk-proj-... key was committed here and should be revoked; load it from the environment instead>'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openai import OpenAI\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Computing AI Outputs: 1%| | 5/842 [00:01<05:16, 2.65it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "passage: or how we can stand on this earth. Science is just one of the methods which can help us to <expression>prove </expression> that our ideas are correct and true. Accordingly, it is dangerous to rely on science to decide everything. Also, \n",
+      "gtruth: -1.0, pred: -1, pred_reason: The use of 'prove' in this context is not rigorous or appropriate, as science typically deals with empirical evidence rather than mathematical proof.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Computing AI Outputs: 2%|▏ | 15/842 [00:06<06:16, 2.20it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "passage: culture, scientific facts could be repeated infinite times. Take Galileo’s famous experiment on falling bodies as an example, to <expression>prove </expression> that the rate of a falling body is independent from its mess, Galileo dropped two cannonballs of different weights \n",
+      "gtruth: -1.0, pred: -1, pred_reason: The term 'prove' is used incorrectly as scientific experiments can support theories but do not provide definitive proof in the mathematical sense.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Computing AI Outputs: 2%|▏ | 16/842 [00:08<07:36, 1.81it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "passage: seen as a rational, impartial, neutral bystander when facing social issues. It poses to us facts and numerical evidence to <expression>prove </expression> hypotheses and clear guesses or stereotypes. But is sticking to science enough? Scientific advancements have profoundly transformed our society, \n",
+      "gtruth: -1.0, pred: -1, pred_reason: The use of 'prove' in this context is inappropriate and inaccurate as it suggests a definitive proof typical of mathematics, whereas scientific hypotheses are generally subject to revision and do not reach absolute proof.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Computing AI Outputs: 2%|▏ | 19/842 [00:09<07:04, 1.94it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "passage: pursuing scientific progress and maintaining the humanity and integrity of the law. Meanwhile, as the rapid development of science may <expression>prove </expression> that some values of society, even those preserved in law, are plainly wrong, the legal education must also highlight \n",
+      "gtruth: -1.0, pred: -1, pred_reason: The term 'prove' is misapplied as it implies a mathematical certainty rather than a scientific or ethical assertion.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Computing AI Outputs: 3%|▎ | 26/842 [00:10<05:30, 2.47it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "passage: an accurate result that no one can deny. To provide that result, one should follow biological gender since it is <expression>proved </expression> by science. However, gender identity can be hardly proved, and it can be different from the perspectives of individuals. \n",
+      "gtruth: -1.0, pred: -1, pred_reason: The use of 'proved' in relation to biological gender is misleading as it implies a mathematical proof rather than scientific evidence.\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "/Users/ahsan/opt/miniconda3/envs/kwik-ai/lib/python3.12/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
+      "/Users/ahsan/opt/miniconda3/envs/kwik-ai/lib/python3.12/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 due to no true samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
+      "/Users/ahsan/opt/miniconda3/envs/kwik-ai/lib/python3.12/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
+      "/Users/ahsan/opt/miniconda3/envs/kwik-ai/lib/python3.12/site-packages/sklearn/metrics/_classification.py:407: UserWarning: A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'accuracy': 1.0,\n",
+       " 'precision': 0.0,\n",
+       " 'recall': 0.0,\n",
+       " 'f1': 0.0,\n",
+       " 'conf_matrix': array([[4]])}"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "client = OpenAI()\n",
+    "\n",
+    "def run_openai_inference(prompt: str, passage: str):\n",
+    "    passage_prompt = f\"\"\"\n",
+    "    Here is the passage you need to analyze:\n",
+    "    <passage>\n",
+    "    {passage}\n",
+    "    </passage>\n",
+    "\n",
+    "    \"\"\"\n",
+    "\n",
+    "\n",
+    "    system_prompt = f\"{prompt}\\n\\n{passage_prompt}\"\n",
+    "\n",
+    "    # print(f\"passage_prompt: {system_prompt}\", end=\"\\n\\n\")\n",
+    "\n",
+    "    format = \"\"\"\n",
+    "    Based on the identified type, extract and return the following data:\n",
+    "    - score\n",
+    "\n",
+    "    **Output format:**\n",
+    "    { \"score\": \"return numeric score here\", \"reason\": \"return a short one liner reason for your score here\" }\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # print(f\"system prompt is: {system_prompt}\")\n",
+    "\n",
+    "    completion = client.chat.completions.create(\n",
+    "        model=\"gpt-4o-mini\",\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": format},\n",
+    "        ]\n",
+    "    )\n",
+    "\n",
+    "    # print(completion.choices[0].message.content)\n",
+    "    try:\n",
+    "        score = int(eval(completion.choices[0].message.content)['score'])\n",
+    "        reason = eval(completion.choices[0].message.content)['reason']\n",
+    "    except:\n",
+    "        score = None\n",
+    "        reason = None\n",
+    "\n",
+    "    return score, reason\n",
+    "\n",
+    "# Function to process DataFrame and compute predictions\n",
+    "def process_dataframe(df, prompt: str, max_iterations: int = 5):\n",
+    "    df['passage'] = df['LeftContext'] + \" <expression>\" + df['Keyword'] + \"</expression> \" + df['RightContext']\n",
+    "\n",
+    "    i = 0\n",
+    "    for idx, row in tqdm(df.sample(frac=1.0).iterrows(), total=df.shape[0], desc='Computing AI Outputs'): # randomizing the df\n",
+    "        try:\n",
+    "            gtruth = row['Category']\n",
+    "\n",
+    "            # Check if ground truth is NaN\n",
+    "            if pd.isna(gtruth):\n",
+    "                raise ValueError('skipping')\n",
+    "        except:\n",
+    "            continue\n",
+    "\n",
+    "        pred_score, pred_reason = run_openai_inference(prompt, row['passage'])\n",
+    "        print(f\"passage: {row['passage']}\")\n",
+    "        print(f\"gtruth: {int(gtruth)}, pred: {pred_score}, pred_reason: {pred_reason}\", end=\"\\n\\n\")\n",
+    "        i += 1\n",
+    "\n",
+    "        if i == max_iterations:\n",
+    "            break\n",
+    "\n",
+    "        df.loc[idx, 'Prediction'] = pred_score\n",
+    "        df.loc[idx, 'Prediction Reason'] = pred_reason\n",
+    "\n",
+    "    df_out = df.dropna(subset=['Prediction'])\n",
+    "    return df_out\n",
+    "\n",
+    "# Ground truth and predictions\n",
+    "def evaluate_dataframe(y_true, y_pred):\n",
+    "    # Compute metrics\n",
+    "    try:\n",
+    "        accuracy = accuracy_score(y_true, y_pred)\n",
+    "    except:\n",
+    "        accuracy = None\n",
+    "\n",
+    "    try:\n",
+    "        precision = precision_score(y_true, y_pred, average='binary') # Use 'macro', 'micro', or 'weighted' for multiclass\n",
+    "    except:\n",
+    "        precision = None\n",
+    "\n",
+    "    try:\n",
+    "        recall = recall_score(y_true, y_pred, average='binary')\n",
+    "    except:\n",
+    "        recall = None\n",
+    "\n",
+    "    try:\n",
+    "        f1 = f1_score(y_true, y_pred, average='binary')\n",
+    "    except:\n",
+    "        f1 = None\n",
+    "\n",
+    "    try:\n",
+    "        conf_matrix = confusion_matrix(y_true, y_pred)\n",
+    "    except:\n",
+    "        conf_matrix = None\n",
+    "\n",
+    "    return {\n",
+    "        \"accuracy\": accuracy,\n",
+    "        \"precision\": precision,\n",
+    "        \"recall\": recall,\n",
+    "        \"f1\": f1,\n",
+    "        \"conf_matrix\": conf_matrix\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "prompt = f\"\"\"\n",
+    "You are a teacher of philosophy of science. \n",
+    "I want you to assess whether the use of the expressions \"to prove\", \"proving\", \"proof\", \"proved\", in the provided passages, \n",
+    "is appropriate, accurate, and logically consistent from the point of view of the rigorous mathematical meaning of the concept of 'proof'. \n",
+    "\n",
+    "Consider statements of the type \"X cannot be proved\" or \"there is no proof for X\" as appropriate even when they are applied to something \n",
+    "for which the concept of 'proof' is not appropriate. Consider questions involving the concept of 'proof' as appropriate even when they are \n",
+    "applied to something for which the concept of 'proof' is not appropriate. You need to respond with a \"+1\" if the use of those \n",
+    "expressions is appropriate, accurate, and logically consistent and with a \"-1\" otherwise. Proving something wrong is also acceptable.\n",
+    "\"\"\"\n",
+    "\n",
+    "df = pd.read_excel(\"./data/Quick_and_rough_list_for_Ahsan_initial_trials - 10th Dec 2024.xlsx\", sheet_name='Data')\n",
+    "\n",
+    "df_out = process_dataframe(df, prompt, max_iterations=5)\n",
+    "evaluate_dataframe(df_out['Category'], df_out['Prediction'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "kwik-ai",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
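Note: the UserWarning in the notebook output above (a single label in y_true and y_pred, giving a 1x1 conf_matrix) can be avoided by passing all known labels. A hedged sketch with illustrative data, not taken from the run:

# Passing `labels` keeps the confusion matrix 3x3 even when a class never appears.
from sklearn.metrics import confusion_matrix

y_true = [-1, -1, -1, -1]
y_pred = [-1, -1, -1, -1]
print(confusion_matrix(y_true, y_pred, labels=[-1, 0, 1]))
# [[4 0 0]
#  [0 0 0]
#  [0 0 0]]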