{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-title",
   "metadata": {},
   "source": [
    "# Build a property graph from medical wiki articles\n",
    "\n",
    "This notebook loads a small sample of the `gamino/wiki_medical_terms` dataset, extracts a property graph from it with LlamaIndex (`DynamicLLMPathExtractor` + `ImplicitPathExtractor`), writes an HTML rendering of the graph to `./kg.html`, and persists the index to `./generated`.\n",
    "\n",
    "**Prerequisite:** the environment (or a `.env` file) must define `AZURE_API_KEY`.\n",
    "\n",
    "Outputs are intentionally not stored in this file; run *Restart Kernel → Run All* to reproduce them."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-setup",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6137a317",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load .env first, ahead of the other imports, in case any of them reads\n",
    "# environment variables at import time.\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a7dc84b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Patch asyncio so event loops can be nested; presumably needed because the\n",
    "# indexing code runs async work inside the kernel's already-running loop.\n",
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a47086ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import datasets\n",
    "from llama_index.core import (\n",
    "    Document,\n",
    "    PropertyGraphIndex,\n",
    "    Settings,\n",
    "    VectorStoreIndex,\n",
    "    load_index_from_storage,\n",
    ")\n",
    "from llama_index.core.graph_stores import SimplePropertyGraphStore\n",
    "from llama_index.core.indices.property_graph import (\n",
    "    DynamicLLMPathExtractor,\n",
    "    ImplicitPathExtractor,\n",
    ")\n",
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.llms.openai_like import OpenAILike"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfg-constants",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives here.\n",
    "DATASET_NAME = \"gamino/wiki_medical_terms\"\n",
    "DATASET_SPLIT = \"train[:20]\"  # small sample keeps the LLM extraction run short\n",
    "\n",
    "LLM_MODEL = \"DeepSeek-V3.2\"\n",
    "LLM_API_BASE = \"https://thong-api-hub.openai.azure.com/openai/v1\"\n",
    "LLM_API_KEY_ENV = \"AZURE_API_KEY\"  # name of the env var holding the key, not the key itself\n",
    "LLM_TIMEOUT_S = 300\n",
    "\n",
    "EMBED_MODEL_NAME = \"BAAI/bge-small-en-v1.5\"\n",
    "\n",
    "KG_HTML_PATH = \"./kg.html\"\n",
    "PERSIST_DIR = \"./generated\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load-data",
   "metadata": {},
   "source": [
    "## Load data\n",
    "\n",
    "Each dataset row's `page_text` becomes one LlamaIndex `Document`, with the row position as its id."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28b17221",
   "metadata": {},
   "outputs": [],
   "source": [
    "og_data = datasets.load_dataset(DATASET_NAME, split=DATASET_SPLIT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca0aa288",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [\n",
    "    Document(id_=str(idx), text=text)\n",
    "    for idx, text in enumerate(og_data[\"page_text\"])\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53a4d694",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-models",
   "metadata": {},
   "source": [
    "## Configure the LLM and the embedding model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e08d931b",
   "metadata": {},
   "outputs": [],
   "source": [
    "api_key = os.getenv(LLM_API_KEY_ENV)\n",
    "if not api_key:\n",
    "    # Stop here with a clear message rather than handing `None` to the client\n",
    "    # and failing later on the first request.\n",
    "    raise RuntimeError(\n",
    "        f\"{LLM_API_KEY_ENV} is not set; add it to .env or export it before running.\"\n",
    "    )\n",
    "\n",
    "llm = OpenAILike(\n",
    "    model=LLM_MODEL,\n",
    "    api_key=api_key,\n",
    "    api_base=LLM_API_BASE,\n",
    "    is_chat_model=True,\n",
    "    timeout=LLM_TIMEOUT_S,\n",
    "    is_function_calling_model=True,\n",
    "    should_use_structured_outputs=True,\n",
    ")\n",
    "Settings.llm = llm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d7d9e4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test: display only the reply text, not the full response object.\n",
    "Settings.llm.complete(\"Hello\").text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "711fd553",
   "metadata": {},
   "outputs": [],
   "source": [
    "embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)\n",
    "\n",
    "Settings.embed_model = embed_model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-build-graph",
   "metadata": {},
   "source": [
    "## Build the property graph\n",
    "\n",
    "This is the slow step: the LLM extractor is run over the documents and the results are embedded."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5b54280",
   "metadata": {},
   "outputs": [],
   "source": [
    "kg_extractor = DynamicLLMPathExtractor(\n",
    "    llm=llm,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6403ff0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "graph_store = SimplePropertyGraphStore()\n",
    "\n",
    "index = PropertyGraphIndex.from_documents(\n",
    "    data,\n",
    "    show_progress=True,\n",
    "    property_graph_store=graph_store,\n",
    "    llm=llm,\n",
    "    embed_model=embed_model,\n",
    "    kg_extractors=[kg_extractor, ImplicitPathExtractor()],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-save",
   "metadata": {},
   "source": [
    "## Save results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5a54ce9",
   "metadata": {},
   "outputs": [],
   "source": [
    "index.property_graph_store.save_networkx_graph(name=KG_HTML_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abedc381",
   "metadata": {},
   "outputs": [],
   "source": [
    "index.storage_context.persist(PERSIST_DIR)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}