ratnakar09 committed
Commit 299a1c6 · verified · 1 Parent(s): 34ff83d

Delete explore_metadata.ipynb

Files changed (1)
  1. explore_metadata.ipynb +0 -211
explore_metadata.ipynb DELETED
@@ -1,211 +0,0 @@
- import json
- with open('metadata.jsonl', 'r') as f:
-     json_list = list(f)
-
- json_QA = []
- for json_str in json_list:
-     json_data = json.loads(json_str)
-     json_QA.append(json_data)
- import random
- random_samples = random.sample(json_QA, 1)
- for sample in random_samples:
-     print("=" * 50)
-     print(f"Task ID: {sample['task_id']}")
-     print(f"Question: {sample['Question']}")
-     print(f"Level: {sample['Level']}")
-     print(f"Final Answer: {sample['Final answer']}")
-     print(f"Annotator Metadata: ")
-     print(f"  ├── Steps: ")
-     for step in sample['Annotator Metadata']['Steps'].split('\n'):
-         print(f"  │      ├── {step}")
-     print(f"  ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}")
-     print(f"  ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}")
-     print(f"  ├── Tools:")
-     for tool in sample['Annotator Metadata']['Tools'].split('\n'):
-         print(f"  │      ├── {tool}")
-     print(f"  └── Number of tools: {sample['Annotator Metadata']['Number of tools']}")
- print("=" * 50)
- ==================================================
- Task ID: d1af70ea-a9a4-421a-b9cc-94b5e02f1788
- Question: As of the 2020 census, what was the population difference between the largest county seat and smallest county seat, by land area of the county seat, in Washington state? For population figures, please use the official data from data.census.gov. Please report the integer difference.
- Level: 2
- Final Answer: 736455
- Annotator Metadata:
-   ├── Steps:
-   │      ├── Step 1: Using a web browser, access a search engine and conduct a search, "Washington cities by area"
-   │      ├── Step 2: Navigate to the second search result, https://en.wikipedia.org/wiki/List_of_municipalities_in_Washington
-   │      ├── Step 3: Evaluate the page contents, finding the largest and smallest county seats by land area, Seattle and Cathlamet
-   │      ├── Step 4: Using a web browser, navigate to https://data.census.gov/
-   │      ├── Step 5: Using the website's search area, conduct a search, Seattle, Washington
-   │      ├── Step 6: Record the reported 2020 Decennial Census population of Seattle, Washington, 737,015
-   │      ├── Step 7: Using the website's search area, conduct a search, Cathlamet, Washington
-   │      ├── Step 8: Record the reported 2020 Decennial Census population of Cathlamet, Washington, 560
-   │      ├── Step 9: Using a calculator, find the difference in populations,
-   │      ├──
-   │      ├── 737,015 - 560
-   │      ├── 736,455
-   │      ├── Step 10: Report the correct answer to my user in the requested format, "736,455"
-   ├── Number of steps: 10
-   ├── How long did this take?: 5 minutes
-   ├── Tools:
-   │      ├── 1. A web browser
-   │      ├── 2. A search engine
-   │      ├── 3. A calculator
-   └── Number of tools: 3
- ==================================================
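A natural follow-on when exploring this metadata is to check how the samples spread across difficulty levels. The short sketch below is added for illustration and was not part of the deleted notebook; it only reuses the json_QA list built in the first cell and the 'Level' field shown above.

    from collections import Counter

    # count samples per 'Level' value (the sample above shows Level: 2)
    level_counts = Counter(sample['Level'] for sample in json_QA)
    print("Samples per level:", dict(level_counts))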
- import os
- from dotenv import load_dotenv
- from langchain_huggingface import HuggingFaceEmbeddings
- from langchain_community.vectorstores import SupabaseVectorStore
- from supabase.client import Client, create_client
-
-
- load_dotenv()
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")  # dim=768
-
- supabase_url = os.environ.get("SUPABASE_URL")
- supabase_key = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")
- supabase: Client = create_client(supabase_url, supabase_key)
- from langchain.schema import Document
- docs = []
- cnt = 0
- for sample in json_QA:
-     content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
-     doc = {
-         "id": cnt,
-         "content": content,
-         "metadata": {
-             "source": sample['task_id']
-         },
-         "embedding": embeddings.embed_query(content),
-     }
-     docs.append(doc)
-     cnt += 1
-
- # upload the documents to the vector database
- try:
-     response = (
-         supabase.table("documents2")
-         .insert(docs)
-         .execute()
-     )
- except Exception as exception:
-     print("Error inserting data into Supabase:", exception)
-
- # # Save the documents (a list of dict) into a csv file, and manually upload it to Supabase
- # import pandas as pd
- # df = pd.DataFrame(docs)
- # df.to_csv('supabase_docs.csv', index=False)
- # add items to vector database
- vector_store = SupabaseVectorStore(
-     client=supabase,
-     embedding=embeddings,
-     table_name="documents2",
-     query_name="match_documents_2",
- )
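One design note on the ingestion loop above: it calls embeddings.embed_query() once per sample. The LangChain Embeddings interface also exposes embed_documents(), which embeds a whole list in one batched call and is typically faster for a local sentence-transformers model. The sketch below is an illustrative alternative, not part of the deleted notebook; it assumes the same json_QA list and embeddings object.

    contents = [f"Question : {s['Question']}\n\nFinal answer : {s['Final answer']}" for s in json_QA]
    vectors = embeddings.embed_documents(contents)  # one batched call instead of one call per sample
    docs = [
        {"id": i, "content": c, "metadata": {"source": s['task_id']}, "embedding": v}
        for i, (s, c, v) in enumerate(zip(json_QA, contents, vectors))
    ]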
- retriever = vector_store.as_retriever()
- query = "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"
- # matched_docs = vector_store.similarity_search(query, k=2)
- docs = retriever.invoke(query)
- docs[0]
- Document(metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d'}, page_content='Question : On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\n\nFinal answer : 80GSFC21M0002')
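Since each stored document packs the question and its reference answer into page_content in the "Question : ...\n\nFinal answer : ..." format shown above, the answer for the top match can be pulled back out with a simple split. This is a small illustrative sketch, not part of the deleted notebook:

    reference_answer = docs[0].page_content.split("Final answer :")[-1].strip()
    print(reference_answer)  # for the query above this yields '80GSFC21M0002'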
- # list of the tools used in all the samples
- from collections import Counter, OrderedDict
-
- tools = []
- for sample in json_QA:
-     for tool in sample['Annotator Metadata']['Tools'].split('\n'):
-         tool = tool[2:].strip().lower()
-         if tool.startswith("("):
-             tool = tool[11:].strip()
-         tools.append(tool)
- tools_counter = OrderedDict(Counter(tools))
- print("List of tools used in all samples:")
- print("Total number of tools used:", len(tools_counter))
- for tool, count in tools_counter.items():
-     print(f"  ├── {tool}: {count}")
- List of tools used in all samples:
- Total number of tools used: 83
-   ├── web browser: 107
-   ├── image recognition tools (to identify and parse a figure with three axes): 1
-   ├── search engine: 101
-   ├── calculator: 34
-   ├── unlambda compiler (optional): 1
-   ├── a web browser.: 2
-   ├── a search engine.: 2
-   ├── a calculator.: 1
-   ├── microsoft excel: 5
-   ├── google search: 1
-   ├── ne: 9
-   ├── pdf access: 7
-   ├── file handling: 2
-   ├── python: 3
-   ├── image recognition tools: 12
-   ├── jsonld file access: 1
-   ├── video parsing: 1
-   ├── python compiler: 1
-   ├── video recognition tools: 3
-   ├── pdf viewer: 7
-   ├── microsoft excel / google sheets: 3
-   ├── word document access: 1
-   ├── tool to extract text from images: 1
-   ├── a word reversal tool / script: 1
-   ├── counter: 1
-   ├── excel: 3
-   ├── image recognition: 5
-   ├── color recognition: 3
-   ├── excel file access: 3
-   ├── xml file access: 1
-   ├── access to the internet archive, web.archive.org: 1
-   ├── text processing/diff tool: 1
-   ├── gif parsing tools: 1
-   ├── a web browser: 7
-   ├── a search engine: 7
-   ├── a speech-to-text tool: 2
-   ├── code/data analysis tools: 1
-   ├── audio capability: 2
-   ├── pdf reader: 1
-   ├── markdown: 1
-   ├── a calculator: 5
-   ├── access to wikipedia: 3
-   ├── image recognition/ocr: 3
-   ├── google translate access: 1
-   ├── ocr: 4
-   ├── bass note data: 1
-   ├── text editor: 1
-   ├── xlsx file access: 1
-   ├── powerpoint viewer: 1
-   ├── csv file access: 1
-   ├── calculator (or use excel): 1
-   ├── computer algebra system: 1
-   ├── video processing software: 1
-   ├── audio processing software: 1
-   ├── computer vision: 1
-   ├── google maps: 1
-   ├── access to excel files: 1
-   ├── calculator (or ability to count): 1
-   ├── a file interface: 3
-   ├── a python ide: 1
-   ├── spreadsheet editor: 1
-   ├── tools required: 1
-   ├── b browser: 1
-   ├── image recognition and processing tools: 1
-   ├── computer vision or ocr: 1
-   ├── c++ compiler: 1
-   ├── access to google maps: 1
-   ├── youtube player: 1
-   ├── natural language processor: 1
-   ├── graph interaction tools: 1
-   ├── bablyonian cuniform -> arabic legend: 1
-   ├── access to youtube: 1
-   ├── image search tools: 1
-   ├── calculator or counting function: 1
-   ├── a speech-to-text audio processing tool: 1
-   ├── access to academic journal websites: 1
-   ├── pdf reader/extracter: 1
-   ├── rubik's cube model: 1
-   ├── wikipedia: 1
-   ├── video capability: 1
-   ├── image processing tools: 1
-   ├── age recognition software: 1
-   ├── youtube: 1
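The raw counts above contain obvious near-duplicates ("a web browser." vs. "a web browser" vs. "web browser", trailing periods, leading articles), so the distinct-name total of 83 overstates the real variety of tools. A light normalization pass would merge many of them; the sketch below is added for illustration only and simply reuses the tools list built in the previous cell.

    import re
    from collections import Counter

    def normalize(name: str) -> str:
        cleaned = name.strip().lower().rstrip(".")        # drop trailing periods, unify case
        cleaned = re.sub(r"^(a|an|the)\s+", "", cleaned)  # drop leading articles
        return cleaned

    merged_counter = Counter(normalize(t) for t in tools)
    print("Distinct tools after normalization:", len(merged_counter))
    for name, count in merged_counter.most_common(10):
        print(f"  ├── {name}: {count}")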