Spaces:
Sleeping
Sleeping
File size: 7,633 Bytes
8c27221 c1bb714 8c27221 c1bb714 8c27221 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"id": "1b0c87d2",
"metadata": {},
"outputs": [],
"source": [
"import json \n",
"with open('metadata.jsonl', 'r') as f: \n",
" json_list = list(f)\n",
"\n",
"json_QA = []\n",
"for json_str in json_list: \n",
" json_data = json.loads(json_str)\n",
" json_QA.append(json_data)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "06ef3470",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==================================================\n",
"Task ID: 305ac316-eef6-4446-960a-92d80d542f82\n",
"Question: Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.\n",
"Level: 1\n",
"Final Answer: Wojciech\n",
"Annotator Metadata: \n",
" βββ Steps: \n",
" β βββ 1. Search \"Polish-language version of Everybody Loves Raymond\" and pull up the Wiki page for Wszyscy kochajΔ
Romana.\n",
" β βββ 2. See that BartΕomiej Kasprzykowski is marked as playing Ray and go to his Wiki page.\n",
" β βββ 3. See that he is stated to have played Wojciech PΕaska in Magda M.\n",
" βββ Number of steps: 3\n",
" βββ How long did this take?: 5 minutes\n",
" βββ Tools:\n",
" β βββ None\n",
" βββ Number of tools: 0\n",
"==================================================\n"
]
}
],
"source": [
"import random\n",
"random_samples = random.sample(json_QA, 1)\n",
"for sample in random_samples:\n",
" print(\"=\" * 50)\n",
" print(f\"Task ID: {sample['task_id']}\")\n",
" print(f\"Question: {sample['Question']}\")\n",
" print(f\"Level: {sample['Level']}\")\n",
" print(f\"Final Answer: {sample['Final answer']}\")\n",
" print(f\"Annotator Metadata: \")\n",
" print(f\" βββ Steps: \")\n",
" for step in sample['Annotator Metadata']['Steps'].split('\\n'):\n",
" print(f\" β βββ {step}\")\n",
" print(f\" βββ Number of steps: {sample['Annotator Metadata']['Number of steps']}\")\n",
" print(f\" βββ How long did this take?: {sample['Annotator Metadata']['How long did this take?']}\")\n",
" print(f\" βββ Tools:\")\n",
" for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
" print(f\" β βββ {tool}\")\n",
" print(f\" βββ Number of tools: {sample['Annotator Metadata']['Number of tools']}\")\n",
"print(\"=\" * 50)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "468f83d4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from dotenv import load_dotenv\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"from langchain_community.vectorstores import SupabaseVectorStore\n",
"from supabase.client import Client, create_client\n",
"\n",
"\n",
"load_dotenv()\n",
"embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\") # dim=768\n",
"\n",
"supabase_url = os.environ.get(\"SUPABASE_URL\")\n",
"supabase_key = os.environ.get(\"SUPABASE_SERVICE_KEY\")\n",
"supabase: Client = create_client(supabase_url, supabase_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c76e4a1",
"metadata": {},
"outputs": [],
"source": [
"from langchain.schema import Document\n",
"docs = []\n",
"cnt = 0 \n",
"for sample in json_QA:\n",
" content = f\"Question : {sample['Question']}\\n\\nFinal answer : {sample['Final answer']}\"\n",
" doc = {\n",
" \"id\" : cnt,\n",
" \"content\" : content,\n",
" \"metadata\" : {\n",
" \"source\" : sample['task_id']\n",
" },\n",
" \"embedding\" : embeddings.embed_query(content),\n",
" }\n",
" docs.append(doc)\n",
" cnt += 1\n",
"\n",
"#print(f\"sample document: {docs[0]}\")\n",
"# upload the documents to the vector database\n",
"try:\n",
" response = (\n",
" supabase.table(\"documents\")\n",
" .insert(docs)\n",
" .execute()\n",
" )\n",
"except Exception as exception:\n",
" print(\"Error inserting data into Supabase:\", exception)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "1d6fa354",
"metadata": {},
"outputs": [],
"source": [
"# add items to vector database\n",
"vector_store = SupabaseVectorStore(\n",
" client=supabase,\n",
" embedding= embeddings,\n",
" table_name=\"documents\",\n",
" query_name=\"match_documents_langchain\",\n",
")\n",
"retriever = vector_store.as_retriever()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "aed648be",
"metadata": {},
"outputs": [],
"source": [
"query = \"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\"\n",
"#matched_docs = vector_store.similarity_search(query, k=2)\n",
"docs = retriever.invoke(query)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "a70e6e83",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d'}, page_content='Question : On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\\n\\nFinal answer : 80GSFC21M0002')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87c3bf86",
"metadata": {},
"outputs": [],
"source": [
"# list of the tools used in all the samples\n",
"from collections import Counter, OrderedDict\n",
"\n",
"tools = []\n",
"for sample in json_QA:\n",
" for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
" tool = tool[2:].strip().lower()\n",
" if tool.startswith(\"(\"):\n",
" tool = tool[11:].strip()\n",
" tools.append(tool)\n",
"tools_counter = OrderedDict(Counter(tools))\n",
"print(\"List of tools used in all samples:\")\n",
"print(\"Total number of tools used:\", len(tools_counter))\n",
"for tool, count in tools_counter.items():\n",
" print(f\" βββ {tool}: {count}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "hf-cert",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|