Upload 3 files
Browse files
20250803_langextract/extraction_results.jsonl
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"extractions": [{"extraction_class": "depature_date", "extraction_text": "2025/08/05", "char_interval": null, "alignment_status": null, "extraction_index": 1, "group_index": 0, "description": null, "attributes": {}}, {"extraction_class": "arrival_date", "extraction_text": "2025/08/04", "char_interval": null, "alignment_status": null, "extraction_index": 2, "group_index": 1, "description": null, "attributes": {}}, {"extraction_class": "name", "extraction_text": "nakamura john", "char_interval": {"start_pos": 31, "end_pos": 44}, "alignment_status": "match_exact", "extraction_index": 3, "group_index": 2, "description": null, "attributes": {}}, {"extraction_class": "fright_name", "extraction_text": "cx0009", "char_interval": {"start_pos": 55, "end_pos": 61}, "alignment_status": "match_exact", "extraction_index": 4, "group_index": 3, "description": null, "attributes": {}}], "text": "[dat]20250805[dat]20250804[nam]nakamura john[age]30[br]cx0009[fr]ar0520", "document_id": "doc_c6b4f79c"}
|
20250803_langextract/test.ipynb
ADDED
|
@@ -0,0 +1,809 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "3bf0e2df",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# sample test"
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"cell_type": "code",
|
| 13 |
+
"execution_count": 1,
|
| 14 |
+
"id": "eb638e6d",
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"import langextract as lx\n",
|
| 19 |
+
"import textwrap\n",
|
| 20 |
+
"from langextract import inference\n",
|
| 21 |
+
"\n",
|
| 22 |
+
"# 1. Define the prompt and extraction rules\n",
|
| 23 |
+
"prompt = textwrap.dedent(\"\"\"\\\n",
|
| 24 |
+
" Extract characters, emotions, and relationships in order of appearance.\n",
|
| 25 |
+
" Use exact text for extractions. Do not paraphrase or overlap entities.\n",
|
| 26 |
+
" Provide meaningful attributes for each entity to add context.\"\"\")\n",
|
| 27 |
+
"\n",
|
| 28 |
+
"# 2. Provide a high-quality example to guide the model\n",
|
| 29 |
+
"examples = [\n",
|
| 30 |
+
" lx.data.ExampleData(\n",
|
| 31 |
+
" text=\"ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.\",\n",
|
| 32 |
+
" extractions=[\n",
|
| 33 |
+
" lx.data.Extraction(\n",
|
| 34 |
+
" extraction_class=\"character\",\n",
|
| 35 |
+
" extraction_text=\"ROMEO\",\n",
|
| 36 |
+
" attributes={\"emotional_state\": \"wonder\"}\n",
|
| 37 |
+
" ),\n",
|
| 38 |
+
" lx.data.Extraction(\n",
|
| 39 |
+
" extraction_class=\"emotion\",\n",
|
| 40 |
+
" extraction_text=\"But soft!\",\n",
|
| 41 |
+
" attributes={\"feeling\": \"gentle awe\"}\n",
|
| 42 |
+
" ),\n",
|
| 43 |
+
" lx.data.Extraction(\n",
|
| 44 |
+
" extraction_class=\"relationship\",\n",
|
| 45 |
+
" extraction_text=\"Juliet is the sun\",\n",
|
| 46 |
+
" attributes={\"type\": \"metaphor\"}\n",
|
| 47 |
+
" ),\n",
|
| 48 |
+
" ]\n",
|
| 49 |
+
" )\n",
|
| 50 |
+
"]"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"cell_type": "code",
|
| 55 |
+
"execution_count": 2,
|
| 56 |
+
"id": "c72822d9",
|
| 57 |
+
"metadata": {},
|
| 58 |
+
"outputs": [
|
| 59 |
+
{
|
| 60 |
+
"name": "stderr",
|
| 61 |
+
"output_type": "stream",
|
| 62 |
+
"text": [
|
| 63 |
+
"\u001b[94m\u001b[1mLangExtract\u001b[0m: Processing, current=\u001b[92m68\u001b[0m chars, processed=\u001b[92m68\u001b[0m chars: [00:11]"
|
| 64 |
+
]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"name": "stdout",
|
| 68 |
+
"output_type": "stream",
|
| 69 |
+
"text": [
|
| 70 |
+
"\u001b[92m✓\u001b[0m Extraction processing complete\n",
|
| 71 |
+
"\u001b[92m✓\u001b[0m Extracted \u001b[1m3\u001b[0m entities (\u001b[1m3\u001b[0m unique types)\n",
|
| 72 |
+
" \u001b[96m•\u001b[0m Time: \u001b[1m11.09s\u001b[0m\n",
|
| 73 |
+
" \u001b[96m•\u001b[0m Speed: \u001b[1m6\u001b[0m chars/sec\n",
|
| 74 |
+
" \u001b[96m•\u001b[0m Chunks: \u001b[1m1\u001b[0m\n"
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"name": "stderr",
|
| 79 |
+
"output_type": "stream",
|
| 80 |
+
"text": [
|
| 81 |
+
"\n"
|
| 82 |
+
]
|
| 83 |
+
}
|
| 84 |
+
],
|
| 85 |
+
"source": [
|
| 86 |
+
"# The input text to be processed\n",
|
| 87 |
+
"input_text = \"Lady Juliet gazed longingly at the stars, her heart aching for Romeo\"\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"# Run the extraction\n",
|
| 90 |
+
"result = lx.extract(\n",
|
| 91 |
+
" text_or_documents=input_text,\n",
|
| 92 |
+
" prompt_description=prompt,\n",
|
| 93 |
+
" examples=examples,\n",
|
| 94 |
+
" language_model_type=inference.OllamaLanguageModel,\n",
|
| 95 |
+
" model_id=\"gemma2:latest\",\n",
|
| 96 |
+
" model_url=\"http://localhost:11434\"\n",
|
| 97 |
+
")"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"cell_type": "code",
|
| 102 |
+
"execution_count": 9,
|
| 103 |
+
"id": "a0c64fc9",
|
| 104 |
+
"metadata": {},
|
| 105 |
+
"outputs": [
|
| 106 |
+
{
|
| 107 |
+
"name": "stdout",
|
| 108 |
+
"output_type": "stream",
|
| 109 |
+
"text": [
|
| 110 |
+
"\u001b[31mType:\u001b[39m AnnotatedDocument\n",
|
| 111 |
+
"\u001b[31mString form:\u001b[39m AnnotatedDocument(extractions=[Extraction(extraction_class='character', extraction_text='Lady Jul <...> ={'type': 'love'})], text='Lady Juliet gazed longingly at the stars, her heart aching for Romeo')\n",
|
| 112 |
+
"\u001b[31mFile:\u001b[39m c:\\users\\kenta\\appdata\\local\\programs\\python\\python312\\lib\\site-packages\\langextract\\data.py\n",
|
| 113 |
+
"\u001b[31mDocstring:\u001b[39m \n",
|
| 114 |
+
"Class for representing annotated documents.\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"Attributes:\n",
|
| 117 |
+
" document_id: Unique identifier for each document - autogenerated if not\n",
|
| 118 |
+
" set.\n",
|
| 119 |
+
" extractions: List of extractions in the document.\n",
|
| 120 |
+
" text: Raw text representation of the document.\n",
|
| 121 |
+
" tokenized_text: Tokenized text of the document, computed from `text`."
|
| 122 |
+
]
|
| 123 |
+
}
|
| 124 |
+
],
|
| 125 |
+
"source": [
|
| 126 |
+
"?result"
|
| 127 |
+
]
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"cell_type": "code",
|
| 131 |
+
"execution_count": 12,
|
| 132 |
+
"id": "af83d97e",
|
| 133 |
+
"metadata": {},
|
| 134 |
+
"outputs": [
|
| 135 |
+
{
|
| 136 |
+
"data": {
|
| 137 |
+
"text/plain": [
|
| 138 |
+
"[Extraction(extraction_class='character', extraction_text='Lady Juliet', char_interval=CharInterval(start_pos=0, end_pos=11), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={'emotional_state': 'longing'}),\n",
|
| 139 |
+
" Extraction(extraction_class='emotion', extraction_text='aching', char_interval=CharInterval(start_pos=52, end_pos=58), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={'feeling': 'sorrow'}),\n",
|
| 140 |
+
" Extraction(extraction_class='relationship', extraction_text='Lady Juliet... for Romeo', char_interval=CharInterval(start_pos=0, end_pos=68), alignment_status=<AlignmentStatus.MATCH_FUZZY: 'match_fuzzy'>, extraction_index=3, group_index=2, description=None, attributes={'type': 'love'})]"
|
| 141 |
+
]
|
| 142 |
+
},
|
| 143 |
+
"execution_count": 12,
|
| 144 |
+
"metadata": {},
|
| 145 |
+
"output_type": "execute_result"
|
| 146 |
+
}
|
| 147 |
+
],
|
| 148 |
+
"source": [
|
| 149 |
+
"result.extractions"
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"cell_type": "code",
|
| 154 |
+
"execution_count": null,
|
| 155 |
+
"id": "aadaf861",
|
| 156 |
+
"metadata": {},
|
| 157 |
+
"outputs": [],
|
| 158 |
+
"source": []
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"cell_type": "code",
|
| 162 |
+
"execution_count": 10,
|
| 163 |
+
"id": "3622840e",
|
| 164 |
+
"metadata": {},
|
| 165 |
+
"outputs": [
|
| 166 |
+
{
|
| 167 |
+
"name": "stderr",
|
| 168 |
+
"output_type": "stream",
|
| 169 |
+
"text": [
|
| 170 |
+
"\u001b[94m\u001b[1mLangExtract\u001b[0m: Saving to \u001b[92mextraction_results.jsonl\u001b[0m: 1 docs [00:00, 501.95 docs/s]"
|
| 171 |
+
]
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"name": "stdout",
|
| 175 |
+
"output_type": "stream",
|
| 176 |
+
"text": [
|
| 177 |
+
"\u001b[92m✓\u001b[0m Saved \u001b[1m1\u001b[0m documents to \u001b[92mextraction_results.jsonl\u001b[0m\n"
|
| 178 |
+
]
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"name": "stderr",
|
| 182 |
+
"output_type": "stream",
|
| 183 |
+
"text": [
|
| 184 |
+
"\n",
|
| 185 |
+
"\u001b[94m\u001b[1mLangExtract\u001b[0m: Loading \u001b[92mextraction_results.jsonl\u001b[0m: 100%|█████████▉| 918/919 [00:00<00:00, 230kB/s]"
|
| 186 |
+
]
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"name": "stdout",
|
| 190 |
+
"output_type": "stream",
|
| 191 |
+
"text": [
|
| 192 |
+
"\u001b[92m✓\u001b[0m Loaded \u001b[1m1\u001b[0m documents from \u001b[92mextraction_results.jsonl\u001b[0m\n"
|
| 193 |
+
]
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"name": "stderr",
|
| 197 |
+
"output_type": "stream",
|
| 198 |
+
"text": [
|
| 199 |
+
"\n"
|
| 200 |
+
]
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"data": {
|
| 204 |
+
"text/plain": [
|
| 205 |
+
"7947"
|
| 206 |
+
]
|
| 207 |
+
},
|
| 208 |
+
"execution_count": 10,
|
| 209 |
+
"metadata": {},
|
| 210 |
+
"output_type": "execute_result"
|
| 211 |
+
}
|
| 212 |
+
],
|
| 213 |
+
"source": [
|
| 214 |
+
"# Save the results to a JSONL file\n",
|
| 215 |
+
"from pathlib import Path\n",
|
| 216 |
+
"lx.io.save_annotated_documents([result], output_name=\"extraction_results.jsonl\", output_dir=Path(\".\"))\n",
|
| 217 |
+
"\n",
|
| 218 |
+
"# Generate the visualization from the file\n",
|
| 219 |
+
"html_content = lx.visualize(\"extraction_results.jsonl\")\n",
|
| 220 |
+
"# HTML 本体文字列を取得してファイル化\n",
|
| 221 |
+
"html_str: str = html_content.data # HTML 文字列が .data に格納されている\n",
|
| 222 |
+
"output_path = Path(\"visualization.html\")\n",
|
| 223 |
+
"output_path.write_text(html_str, encoding=\"utf-8\")"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"cell_type": "code",
|
| 228 |
+
"execution_count": 8,
|
| 229 |
+
"id": "16c245f1",
|
| 230 |
+
"metadata": {},
|
| 231 |
+
"outputs": [
|
| 232 |
+
{
|
| 233 |
+
"data": {
|
| 234 |
+
"text/html": [
|
| 235 |
+
"<style>\n",
|
| 236 |
+
".lx-highlight { position: relative; border-radius:3px; padding:1px 2px;}\n",
|
| 237 |
+
".lx-highlight .lx-tooltip {\n",
|
| 238 |
+
" visibility: hidden;\n",
|
| 239 |
+
" opacity: 0;\n",
|
| 240 |
+
" transition: opacity 0.2s ease-in-out;\n",
|
| 241 |
+
" background: #333;\n",
|
| 242 |
+
" color: #fff;\n",
|
| 243 |
+
" text-align: left;\n",
|
| 244 |
+
" border-radius: 4px;\n",
|
| 245 |
+
" padding: 6px 8px;\n",
|
| 246 |
+
" position: absolute;\n",
|
| 247 |
+
" z-index: 1000;\n",
|
| 248 |
+
" bottom: 125%;\n",
|
| 249 |
+
" left: 50%;\n",
|
| 250 |
+
" transform: translateX(-50%);\n",
|
| 251 |
+
" font-size: 12px;\n",
|
| 252 |
+
" max-width: 240px;\n",
|
| 253 |
+
" white-space: normal;\n",
|
| 254 |
+
" box-shadow: 0 2px 6px rgba(0,0,0,0.3);\n",
|
| 255 |
+
"}\n",
|
| 256 |
+
".lx-highlight:hover .lx-tooltip { visibility: visible; opacity:1; }\n",
|
| 257 |
+
".lx-animated-wrapper { max-width: 100%; font-family: Arial, sans-serif; }\n",
|
| 258 |
+
".lx-controls {\n",
|
| 259 |
+
" background: #fafafa; border: 1px solid #90caf9; border-radius: 8px;\n",
|
| 260 |
+
" padding: 12px; margin-bottom: 16px;\n",
|
| 261 |
+
"}\n",
|
| 262 |
+
".lx-button-row {\n",
|
| 263 |
+
" display: flex; justify-content: center; gap: 8px; margin-bottom: 12px;\n",
|
| 264 |
+
"}\n",
|
| 265 |
+
".lx-control-btn {\n",
|
| 266 |
+
" background: #4285f4; color: white; border: none; border-radius: 4px;\n",
|
| 267 |
+
" padding: 8px 16px; cursor: pointer; font-size: 13px; font-weight: 500;\n",
|
| 268 |
+
" transition: background-color 0.2s;\n",
|
| 269 |
+
"}\n",
|
| 270 |
+
".lx-control-btn:hover { background: #3367d6; }\n",
|
| 271 |
+
".lx-progress-container {\n",
|
| 272 |
+
" margin-bottom: 8px;\n",
|
| 273 |
+
"}\n",
|
| 274 |
+
".lx-progress-slider {\n",
|
| 275 |
+
" width: 100%; margin: 0; appearance: none; height: 6px;\n",
|
| 276 |
+
" background: #ddd; border-radius: 3px; outline: none;\n",
|
| 277 |
+
"}\n",
|
| 278 |
+
".lx-progress-slider::-webkit-slider-thumb {\n",
|
| 279 |
+
" appearance: none; width: 18px; height: 18px; background: #4285f4;\n",
|
| 280 |
+
" border-radius: 50%; cursor: pointer;\n",
|
| 281 |
+
"}\n",
|
| 282 |
+
".lx-progress-slider::-moz-range-thumb {\n",
|
| 283 |
+
" width: 18px; height: 18px; background: #4285f4; border-radius: 50%;\n",
|
| 284 |
+
" cursor: pointer; border: none;\n",
|
| 285 |
+
"}\n",
|
| 286 |
+
".lx-status-text {\n",
|
| 287 |
+
" text-align: center; font-size: 12px; color: #666; margin-top: 4px;\n",
|
| 288 |
+
"}\n",
|
| 289 |
+
".lx-text-window {\n",
|
| 290 |
+
" font-family: monospace; white-space: pre-wrap; border: 1px solid #90caf9;\n",
|
| 291 |
+
" padding: 12px; max-height: 260px; overflow-y: auto; margin-bottom: 12px;\n",
|
| 292 |
+
" line-height: 1.6;\n",
|
| 293 |
+
"}\n",
|
| 294 |
+
".lx-attributes-panel {\n",
|
| 295 |
+
" background: #fafafa; border: 1px solid #90caf9; border-radius: 6px;\n",
|
| 296 |
+
" padding: 8px 10px; margin-top: 8px; font-size: 13px;\n",
|
| 297 |
+
"}\n",
|
| 298 |
+
".lx-current-highlight {\n",
|
| 299 |
+
" text-decoration: underline;\n",
|
| 300 |
+
" text-decoration-color: #ff4444;\n",
|
| 301 |
+
" text-decoration-thickness: 3px;\n",
|
| 302 |
+
" font-weight: bold;\n",
|
| 303 |
+
" animation: lx-pulse 1s ease-in-out;\n",
|
| 304 |
+
"}\n",
|
| 305 |
+
"@keyframes lx-pulse {\n",
|
| 306 |
+
" 0% { text-decoration-color: #ff4444; }\n",
|
| 307 |
+
" 50% { text-decoration-color: #ff0000; }\n",
|
| 308 |
+
" 100% { text-decoration-color: #ff4444; }\n",
|
| 309 |
+
"}\n",
|
| 310 |
+
".lx-legend {\n",
|
| 311 |
+
" font-size: 12px; margin-bottom: 8px;\n",
|
| 312 |
+
" padding-bottom: 8px; border-bottom: 1px solid #e0e0e0;\n",
|
| 313 |
+
"}\n",
|
| 314 |
+
".lx-label {\n",
|
| 315 |
+
" display: inline-block;\n",
|
| 316 |
+
" padding: 2px 4px;\n",
|
| 317 |
+
" border-radius: 3px;\n",
|
| 318 |
+
" margin-right: 4px;\n",
|
| 319 |
+
" color: #000;\n",
|
| 320 |
+
"}\n",
|
| 321 |
+
".lx-attr-key {\n",
|
| 322 |
+
" font-weight: 600;\n",
|
| 323 |
+
" color: #1565c0;\n",
|
| 324 |
+
" letter-spacing: 0.3px;\n",
|
| 325 |
+
"}\n",
|
| 326 |
+
".lx-attr-value {\n",
|
| 327 |
+
" font-weight: 400;\n",
|
| 328 |
+
" opacity: 0.85;\n",
|
| 329 |
+
" letter-spacing: 0.2px;\n",
|
| 330 |
+
"}\n",
|
| 331 |
+
"\n",
|
| 332 |
+
"/* Add optimizations with larger fonts and better readability for GIFs */\n",
|
| 333 |
+
".lx-gif-optimized .lx-text-window { font-size: 16px; line-height: 1.8; }\n",
|
| 334 |
+
".lx-gif-optimized .lx-attributes-panel { font-size: 15px; }\n",
|
| 335 |
+
".lx-gif-optimized .lx-current-highlight { text-decoration-thickness: 4px; }\n",
|
| 336 |
+
"</style>\n",
|
| 337 |
+
"<div class=\"lx-animated-wrapper lx-gif-optimized\">\n",
|
| 338 |
+
" <div class=\"lx-attributes-panel\">\n",
|
| 339 |
+
" <div class=\"lx-legend\">Highlights Legend: <span class=\"lx-label\" style=\"background-color:#D2E3FC;\">character</span> <span class=\"lx-label\" style=\"background-color:#C8E6C9;\">emotion</span> <span class=\"lx-label\" style=\"background-color:#FEF0C3;\">relationship</span></div>\n",
|
| 340 |
+
" <div id=\"attributesContainer\"></div>\n",
|
| 341 |
+
" </div>\n",
|
| 342 |
+
" <div class=\"lx-text-window\" id=\"textWindow\">\n",
|
| 343 |
+
" <span class=\"lx-highlight lx-current-highlight\" data-idx=\"0\" style=\"background-color:#FEF0C3;\"><span class=\"lx-highlight\" data-idx=\"1\" style=\"background-color:#D2E3FC;\">Lady Juliet</span> gazed longingly at the stars, her heart <span class=\"lx-highlight\" data-idx=\"2\" style=\"background-color:#C8E6C9;\">aching</span> for Romeo</span>\n",
|
| 344 |
+
" </div>\n",
|
| 345 |
+
" <div class=\"lx-controls\">\n",
|
| 346 |
+
" <div class=\"lx-button-row\">\n",
|
| 347 |
+
" <button class=\"lx-control-btn\" onclick=\"playPause()\">▶️ Play</button>\n",
|
| 348 |
+
" <button class=\"lx-control-btn\" onclick=\"prevExtraction()\">⏮ Previous</button>\n",
|
| 349 |
+
" <button class=\"lx-control-btn\" onclick=\"nextExtraction()\">⏭ Next</button>\n",
|
| 350 |
+
" </div>\n",
|
| 351 |
+
" <div class=\"lx-progress-container\">\n",
|
| 352 |
+
" <input type=\"range\" id=\"progressSlider\" class=\"lx-progress-slider\"\n",
|
| 353 |
+
" min=\"0\" max=\"2\" value=\"0\"\n",
|
| 354 |
+
" onchange=\"jumpToExtraction(this.value)\">\n",
|
| 355 |
+
" </div>\n",
|
| 356 |
+
" <div class=\"lx-status-text\">\n",
|
| 357 |
+
" Entity <span id=\"entityInfo\">1/3</span> |\n",
|
| 358 |
+
" Pos <span id=\"posInfo\">[0-11]</span>\n",
|
| 359 |
+
" </div>\n",
|
| 360 |
+
" </div>\n",
|
| 361 |
+
"</div>\n",
|
| 362 |
+
"\n",
|
| 363 |
+
"<script>\n",
|
| 364 |
+
" (function() {\n",
|
| 365 |
+
" const extractions = [{\"index\": 0, \"class\": \"relationship\", \"text\": \"Lady Juliet and Romeo\", \"color\": \"#FEF0C3\", \"startPos\": 0, \"endPos\": 68, \"beforeText\": \"\", \"extractionText\": \"Lady Juliet gazed longingly at the stars, her heart aching for Romeo\", \"afterText\": \"\", \"attributesHtml\": \"<div><strong>class:</strong> relationship</div><div><strong>attributes:</strong> {<span class=\\\"lx-attr-key\\\">type</span>: <span class=\\\"lx-attr-value\\\">romantic love</span>}</div>\"}, {\"index\": 1, \"class\": \"character\", \"text\": \"Lady Juliet\", \"color\": \"#D2E3FC\", \"startPos\": 0, \"endPos\": 11, \"beforeText\": \"\", \"extractionText\": \"Lady Juliet\", \"afterText\": \" gazed longingly at the stars, her heart aching for Romeo\", \"attributesHtml\": \"<div><strong>class:</strong> character</div><div><strong>attributes:</strong> {<span class=\\\"lx-attr-key\\\">emotional_state</span>: <span class=\\\"lx-attr-value\\\">longing</span>}</div>\"}, {\"index\": 2, \"class\": \"emotion\", \"text\": \"aching\", \"color\": \"#C8E6C9\", \"startPos\": 52, \"endPos\": 58, \"beforeText\": \"Lady Juliet gazed longingly at the stars, her heart \", \"extractionText\": \"aching\", \"afterText\": \" for Romeo\", \"attributesHtml\": \"<div><strong>class:</strong> emotion</div><div><strong>attributes:</strong> {<span class=\\\"lx-attr-key\\\">feeling</span>: <span class=\\\"lx-attr-value\\\">sorrowful desire</span>}</div>\"}];\n",
|
| 366 |
+
" let currentIndex = 0;\n",
|
| 367 |
+
" let isPlaying = false;\n",
|
| 368 |
+
" let animationInterval = null;\n",
|
| 369 |
+
" let animationSpeed = 1.0;\n",
|
| 370 |
+
"\n",
|
| 371 |
+
" function updateDisplay() {\n",
|
| 372 |
+
" const extraction = extractions[currentIndex];\n",
|
| 373 |
+
" if (!extraction) return;\n",
|
| 374 |
+
"\n",
|
| 375 |
+
" document.getElementById('attributesContainer').innerHTML = extraction.attributesHtml;\n",
|
| 376 |
+
" document.getElementById('entityInfo').textContent = (currentIndex + 1) + '/' + extractions.length;\n",
|
| 377 |
+
" document.getElementById('posInfo').textContent = '[' + extraction.startPos + '-' + extraction.endPos + ']';\n",
|
| 378 |
+
" document.getElementById('progressSlider').value = currentIndex;\n",
|
| 379 |
+
"\n",
|
| 380 |
+
" const playBtn = document.querySelector('.lx-control-btn');\n",
|
| 381 |
+
" if (playBtn) playBtn.textContent = isPlaying ? '⏸ Pause' : '▶️ Play';\n",
|
| 382 |
+
"\n",
|
| 383 |
+
" const prevHighlight = document.querySelector('.lx-text-window .lx-current-highlight');\n",
|
| 384 |
+
" if (prevHighlight) prevHighlight.classList.remove('lx-current-highlight');\n",
|
| 385 |
+
" const currentSpan = document.querySelector('.lx-text-window span[data-idx=\"' + currentIndex + '\"]');\n",
|
| 386 |
+
" if (currentSpan) {\n",
|
| 387 |
+
" currentSpan.classList.add('lx-current-highlight');\n",
|
| 388 |
+
" currentSpan.scrollIntoView({block: 'center', behavior: 'smooth'});\n",
|
| 389 |
+
" }\n",
|
| 390 |
+
" }\n",
|
| 391 |
+
"\n",
|
| 392 |
+
" function nextExtraction() {\n",
|
| 393 |
+
" currentIndex = (currentIndex + 1) % extractions.length;\n",
|
| 394 |
+
" updateDisplay();\n",
|
| 395 |
+
" }\n",
|
| 396 |
+
"\n",
|
| 397 |
+
" function prevExtraction() {\n",
|
| 398 |
+
" currentIndex = (currentIndex - 1 + extractions.length) % extractions.length;\n",
|
| 399 |
+
" updateDisplay();\n",
|
| 400 |
+
" }\n",
|
| 401 |
+
"\n",
|
| 402 |
+
" function jumpToExtraction(index) {\n",
|
| 403 |
+
" currentIndex = parseInt(index);\n",
|
| 404 |
+
" updateDisplay();\n",
|
| 405 |
+
" }\n",
|
| 406 |
+
"\n",
|
| 407 |
+
" function playPause() {\n",
|
| 408 |
+
" if (isPlaying) {\n",
|
| 409 |
+
" clearInterval(animationInterval);\n",
|
| 410 |
+
" isPlaying = false;\n",
|
| 411 |
+
" } else {\n",
|
| 412 |
+
" animationInterval = setInterval(nextExtraction, animationSpeed * 1000);\n",
|
| 413 |
+
" isPlaying = true;\n",
|
| 414 |
+
" }\n",
|
| 415 |
+
" updateDisplay();\n",
|
| 416 |
+
" }\n",
|
| 417 |
+
"\n",
|
| 418 |
+
" window.playPause = playPause;\n",
|
| 419 |
+
" window.nextExtraction = nextExtraction;\n",
|
| 420 |
+
" window.prevExtraction = prevExtraction;\n",
|
| 421 |
+
" window.jumpToExtraction = jumpToExtraction;\n",
|
| 422 |
+
"\n",
|
| 423 |
+
" updateDisplay();\n",
|
| 424 |
+
" })();\n",
|
| 425 |
+
"</script>"
|
| 426 |
+
],
|
| 427 |
+
"text/plain": [
|
| 428 |
+
"<IPython.core.display.HTML object>"
|
| 429 |
+
]
|
| 430 |
+
},
|
| 431 |
+
"execution_count": 8,
|
| 432 |
+
"metadata": {},
|
| 433 |
+
"output_type": "execute_result"
|
| 434 |
+
}
|
| 435 |
+
],
|
| 436 |
+
"source": [
|
| 437 |
+
"html_content"
|
| 438 |
+
]
|
| 439 |
+
},
|
| 440 |
+
{
|
| 441 |
+
"cell_type": "markdown",
|
| 442 |
+
"id": "49ec5f64",
|
| 443 |
+
"metadata": {},
|
| 444 |
+
"source": [
|
| 445 |
+
"# My test"
|
| 446 |
+
]
|
| 447 |
+
},
|
| 448 |
+
{
|
| 449 |
+
"cell_type": "code",
|
| 450 |
+
"execution_count": 14,
|
| 451 |
+
"id": "2314fae3",
|
| 452 |
+
"metadata": {},
|
| 453 |
+
"outputs": [],
|
| 454 |
+
"source": [
|
| 455 |
+
"import langextract as lx\n",
|
| 456 |
+
"import textwrap"
|
| 457 |
+
]
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"cell_type": "code",
|
| 461 |
+
"execution_count": 36,
|
| 462 |
+
"id": "2a39a1c0",
|
| 463 |
+
"metadata": {},
|
| 464 |
+
"outputs": [],
|
| 465 |
+
"source": [
|
| 466 |
+
"# 1. Define the prompt and extraction rules\n",
|
| 467 |
+
"prompt = textwrap.dedent(\"\"\"\\\n",
|
| 468 |
+
" フライトの情報です。データの規則性に従い、データを抽出してください。\n",
|
| 469 |
+
" 抽出は、データの順序を保ち、言い換えやパラフレーズを避けてください。\n",
|
| 470 |
+
" 各エンティティには、意味のある属性を追加してコンテキストを提供してください。\n",
|
| 471 |
+
" 出発日、到着日、フライト名などの情報を抽出してください。その際に日付から考えて、出発、到着の順になるように整合性を確認してください。\"\"\")"
|
| 472 |
+
]
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"cell_type": "code",
|
| 476 |
+
"execution_count": 37,
|
| 477 |
+
"id": "15aa1dd6",
|
| 478 |
+
"metadata": {},
|
| 479 |
+
"outputs": [],
|
| 480 |
+
"source": [
|
| 481 |
+
"# 2. Provide a high-quality example to guide the model\n",
|
| 482 |
+
"examples = [\n",
|
| 483 |
+
" lx.data.ExampleData(\n",
|
| 484 |
+
" text=\"[dat]20250801[nam]taro tanaka[age]20[dat]20250803[fr]cx0520\",\n",
|
| 485 |
+
" extractions=[\n",
|
| 486 |
+
" lx.data.Extraction(\n",
|
| 487 |
+
" extraction_class=\"depature_date\",\n",
|
| 488 |
+
" extraction_text=\"2025/08/01\",\n",
|
| 489 |
+
" ),\n",
|
| 490 |
+
" lx.data.Extraction(\n",
|
| 491 |
+
" extraction_class=\"name\",\n",
|
| 492 |
+
" extraction_text=\"taro tanaka!\",\n",
|
| 493 |
+
" ),\n",
|
| 494 |
+
" lx.data.Extraction(\n",
|
| 495 |
+
" extraction_class=\"arrival_date\",\n",
|
| 496 |
+
" extraction_text=\"2025/08/03\",\n",
|
| 497 |
+
" ),\n",
|
| 498 |
+
" lx.data.Extraction(\n",
|
| 499 |
+
" extraction_class=\"fright_name\",\n",
|
| 500 |
+
" extraction_text=\"cx0520\",\n",
|
| 501 |
+
" ),\n",
|
| 502 |
+
" ]\n",
|
| 503 |
+
" )]"
|
| 504 |
+
]
|
| 505 |
+
},
|
| 506 |
+
{
|
| 507 |
+
"cell_type": "code",
|
| 508 |
+
"execution_count": 38,
|
| 509 |
+
"id": "82f1b2bf",
|
| 510 |
+
"metadata": {},
|
| 511 |
+
"outputs": [
|
| 512 |
+
{
|
| 513 |
+
"name": "stderr",
|
| 514 |
+
"output_type": "stream",
|
| 515 |
+
"text": [
|
| 516 |
+
"\u001b[94m\u001b[1mLangExtract\u001b[0m: Processing, current=\u001b[92m71\u001b[0m chars, processed=\u001b[92m71\u001b[0m chars: [00:09]"
|
| 517 |
+
]
|
| 518 |
+
},
|
| 519 |
+
{
|
| 520 |
+
"name": "stdout",
|
| 521 |
+
"output_type": "stream",
|
| 522 |
+
"text": [
|
| 523 |
+
"\u001b[92m✓\u001b[0m Extraction processing complete\n",
|
| 524 |
+
"\u001b[92m✓\u001b[0m Extracted \u001b[1m4\u001b[0m entities (\u001b[1m4\u001b[0m unique types)\n",
|
| 525 |
+
" \u001b[96m•\u001b[0m Time: \u001b[1m9.18s\u001b[0m\n",
|
| 526 |
+
" \u001b[96m•\u001b[0m Speed: \u001b[1m8\u001b[0m chars/sec\n",
|
| 527 |
+
" \u001b[96m•\u001b[0m Chunks: \u001b[1m1\u001b[0m\n"
|
| 528 |
+
]
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"name": "stderr",
|
| 532 |
+
"output_type": "stream",
|
| 533 |
+
"text": [
|
| 534 |
+
"\n"
|
| 535 |
+
]
|
| 536 |
+
}
|
| 537 |
+
],
|
| 538 |
+
"source": [
|
| 539 |
+
"# The input text to be processed\n",
|
| 540 |
+
"input_text = \"[dat]20250804[nam]nakamura john[age]30[dat]20250805[br]cx0009[fr]ar0520\"\n",
|
| 541 |
+
"input_text = \"[dat]20250805[dat]20250804[nam]nakamura john[age]30[br]cx0009[fr]ar0520\"\n",
|
| 542 |
+
"\n",
|
| 543 |
+
"# Run the extraction\n",
|
| 544 |
+
"result = lx.extract(\n",
|
| 545 |
+
" text_or_documents=input_text,\n",
|
| 546 |
+
" prompt_description=prompt,\n",
|
| 547 |
+
" examples=examples,\n",
|
| 548 |
+
" language_model_type=inference.OllamaLanguageModel,\n",
|
| 549 |
+
" model_id=\"gemma2:latest\",\n",
|
| 550 |
+
" model_url=\"http://localhost:11434\"\n",
|
| 551 |
+
")"
|
| 552 |
+
]
|
| 553 |
+
},
|
| 554 |
+
{
|
| 555 |
+
"cell_type": "code",
|
| 556 |
+
"execution_count": 34,
|
| 557 |
+
"id": "b6d58afe",
|
| 558 |
+
"metadata": {},
|
| 559 |
+
"outputs": [
|
| 560 |
+
{
|
| 561 |
+
"name": "stderr",
|
| 562 |
+
"output_type": "stream",
|
| 563 |
+
"text": [
|
| 564 |
+
"\u001b[94m\u001b[1mLangExtract\u001b[0m: Saving to \u001b[92mextraction_results.jsonl\u001b[0m: 1 docs [00:00, 500.10 docs/s]"
|
| 565 |
+
]
|
| 566 |
+
},
|
| 567 |
+
{
|
| 568 |
+
"name": "stdout",
|
| 569 |
+
"output_type": "stream",
|
| 570 |
+
"text": [
|
| 571 |
+
"\u001b[92m✓\u001b[0m Saved \u001b[1m1\u001b[0m documents to \u001b[92mextraction_results.jsonl\u001b[0m\n"
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"name": "stderr",
|
| 576 |
+
"output_type": "stream",
|
| 577 |
+
"text": [
|
| 578 |
+
"\n",
|
| 579 |
+
"\u001b[94m\u001b[1mLangExtract\u001b[0m: Loading \u001b[92mextraction_results.jsonl\u001b[0m: 100%|█████████▉| 997/998 [00:00<00:00, 994kB/s]"
|
| 580 |
+
]
|
| 581 |
+
},
|
| 582 |
+
{
|
| 583 |
+
"name": "stdout",
|
| 584 |
+
"output_type": "stream",
|
| 585 |
+
"text": [
|
| 586 |
+
"\u001b[92m✓\u001b[0m Loaded \u001b[1m1\u001b[0m documents from \u001b[92mextraction_results.jsonl\u001b[0m\n"
|
| 587 |
+
]
|
| 588 |
+
},
|
| 589 |
+
{
|
| 590 |
+
"name": "stderr",
|
| 591 |
+
"output_type": "stream",
|
| 592 |
+
"text": [
|
| 593 |
+
"\n"
|
| 594 |
+
]
|
| 595 |
+
}
|
| 596 |
+
],
|
| 597 |
+
"source": [
|
| 598 |
+
"# Save the results to a JSONL file\n",
|
| 599 |
+
"from pathlib import Path\n",
|
| 600 |
+
"lx.io.save_annotated_documents([result], output_name=\"extraction_results.jsonl\", output_dir=Path(\".\"))\n",
|
| 601 |
+
"\n",
|
| 602 |
+
"# Generate the visualization from the file\n",
|
| 603 |
+
"html_content = lx.visualize(\"extraction_results.jsonl\")\n"
|
| 604 |
+
]
|
| 605 |
+
},
|
| 606 |
+
{
|
| 607 |
+
"cell_type": "code",
|
| 608 |
+
"execution_count": 27,
|
| 609 |
+
"id": "0d45589e",
|
| 610 |
+
"metadata": {},
|
| 611 |
+
"outputs": [
|
| 612 |
+
{
|
| 613 |
+
"data": {
|
| 614 |
+
"text/plain": [
|
| 615 |
+
"[Extraction(extraction_class='departure_date', extraction_text='2025/08/04', char_interval=None, alignment_status=None, extraction_index=1, group_index=0, description=None, attributes={}),\n",
|
| 616 |
+
" Extraction(extraction_class='name', extraction_text='nakamura john', char_interval=CharInterval(start_pos=18, end_pos=31), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={}),\n",
|
| 617 |
+
" Extraction(extraction_class='arrival_date', extraction_text='2025/08/05', char_interval=None, alignment_status=None, extraction_index=3, group_index=2, description=None, attributes={}),\n",
|
| 618 |
+
" Extraction(extraction_class='flight_name', extraction_text='cx0009', char_interval=CharInterval(start_pos=55, end_pos=61), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=4, group_index=3, description=None, attributes={'type': 'departure'}),\n",
|
| 619 |
+
" Extraction(extraction_class='flight_name', extraction_text='ar0520', char_interval=CharInterval(start_pos=65, end_pos=71), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=5, group_index=4, description=None, attributes={'type': 'arrival'})]"
|
| 620 |
+
]
|
| 621 |
+
},
|
| 622 |
+
"execution_count": 27,
|
| 623 |
+
"metadata": {},
|
| 624 |
+
"output_type": "execute_result"
|
| 625 |
+
}
|
| 626 |
+
],
|
| 627 |
+
"source": [
|
| 628 |
+
"result.extractions"
|
| 629 |
+
]
|
| 630 |
+
},
|
| 631 |
+
{
|
| 632 |
+
"cell_type": "code",
|
| 633 |
+
"execution_count": null,
|
| 634 |
+
"id": "34459cac",
|
| 635 |
+
"metadata": {},
|
| 636 |
+
"outputs": [
|
| 637 |
+
{
|
| 638 |
+
"data": {
|
| 639 |
+
"text/plain": [
|
| 640 |
+
"[Extraction(extraction_class='depature_date', extraction_text='2025/08/05', char_interval=None, alignment_status=None, extraction_index=1, group_index=0, description=None, attributes={}),\n",
|
| 641 |
+
" Extraction(extraction_class='arrival_date', extraction_text='2025/08/04', char_interval=None, alignment_status=None, extraction_index=2, group_index=1, description=None, attributes={}),\n",
|
| 642 |
+
" Extraction(extraction_class='name', extraction_text='nakamura john', char_interval=CharInterval(start_pos=31, end_pos=44), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=3, group_index=2, description=None, attributes={}),\n",
|
| 643 |
+
" Extraction(extraction_class='fright_name', extraction_text='cx0009', char_interval=CharInterval(start_pos=55, end_pos=61), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=4, group_index=3, description=None, attributes={})]"
|
| 644 |
+
]
|
| 645 |
+
},
|
| 646 |
+
"execution_count": 39,
|
| 647 |
+
"metadata": {},
|
| 648 |
+
"output_type": "execute_result"
|
| 649 |
+
}
|
| 650 |
+
],
|
| 651 |
+
"source": [
|
| 652 |
+
"result.extractions"
|
| 653 |
+
]
|
| 654 |
+
},
|
| 655 |
+
{
|
| 656 |
+
"cell_type": "code",
|
| 657 |
+
"execution_count": null,
|
| 658 |
+
"id": "e2eba844",
|
| 659 |
+
"metadata": {},
|
| 660 |
+
"outputs": [],
|
| 661 |
+
"source": []
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"cell_type": "markdown",
|
| 665 |
+
"id": "14fbd61f",
|
| 666 |
+
"metadata": {},
|
| 667 |
+
"source": [
|
| 668 |
+
"# 階層テスト"
|
| 669 |
+
]
|
| 670 |
+
},
|
| 671 |
+
{
|
| 672 |
+
"cell_type": "code",
|
| 673 |
+
"execution_count": 43,
|
| 674 |
+
"id": "bd3dfda7",
|
| 675 |
+
"metadata": {},
|
| 676 |
+
"outputs": [
|
| 677 |
+
{
|
| 678 |
+
"name": "stderr",
|
| 679 |
+
"output_type": "stream",
|
| 680 |
+
"text": [
|
| 681 |
+
"\u001b[94m\u001b[1mLangExtract\u001b[0m: Processing, current=\u001b[92m40\u001b[0m chars, processed=\u001b[92m40\u001b[0m chars: [00:21]"
|
| 682 |
+
]
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"name": "stdout",
|
| 686 |
+
"output_type": "stream",
|
| 687 |
+
"text": [
|
| 688 |
+
"\u001b[92m✓\u001b[0m Extraction processing complete\n",
|
| 689 |
+
"\u001b[92m✓\u001b[0m Extracted \u001b[1m5\u001b[0m entities (\u001b[1m1\u001b[0m unique types)\n",
|
| 690 |
+
" \u001b[96m•\u001b[0m Time: \u001b[1m21.95s\u001b[0m\n",
|
| 691 |
+
" \u001b[96m•\u001b[0m Speed: \u001b[1m2\u001b[0m chars/sec\n",
|
| 692 |
+
" \u001b[96m•\u001b[0m Chunks: \u001b[1m1\u001b[0m\n",
|
| 693 |
+
"AnnotatedDocument(extractions=[Extraction(extraction_class='heading', extraction_text='第2章:分析', char_interval=CharInterval(start_pos=0, end_pos=6), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={'level': 1, 'children': ['2.1 データ', '2.2 結果']}), Extraction(extraction_class='heading', extraction_text='2.1 データ', char_interval=CharInterval(start_pos=7, end_pos=14), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={'level': 2, 'children': ['2.1.1 収集', '2.1.2 前処理']}), Extraction(extraction_class='heading', extraction_text='2.1.1 収集', char_interval=CharInterval(start_pos=15, end_pos=23), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=3, group_index=2, description=None, attributes={'level': 3, 'children': []}), Extraction(extraction_class='heading', extraction_text='2.1.2 前処理', char_interval=CharInterval(start_pos=24, end_pos=33), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=4, group_index=3, description=None, attributes={'level': 3, 'children': []}), Extraction(extraction_class='heading', extraction_text='2.2 結果', char_interval=CharInterval(start_pos=34, end_pos=40), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=5, group_index=4, description=None, attributes={'level': 2, 'children': []})], text='第2章:分析\\n2.1 データ\\n2.1.1 収集\\n2.1.2 前処理\\n2.2 結果')\n"
|
| 694 |
+
]
|
| 695 |
+
},
|
| 696 |
+
{
|
| 697 |
+
"name": "stderr",
|
| 698 |
+
"output_type": "stream",
|
| 699 |
+
"text": [
|
| 700 |
+
"\n"
|
| 701 |
+
]
|
| 702 |
+
}
|
| 703 |
+
],
|
| 704 |
+
"source": [
|
| 705 |
+
"import langextract as lx\n",
|
| 706 |
+
"import textwrap\n",
|
| 707 |
+
"\n",
|
| 708 |
+
"prompt = textwrap.dedent(\"\"\"\\\n",
|
| 709 |
+
"以下の文章から「見出し階層」を抽出してください。\n",
|
| 710 |
+
"各階層は JSON にネストされた children リストで表現します。\n",
|
| 711 |
+
"出力の構造を見本にならって厳密に守ってください。\"\"\")\n",
|
| 712 |
+
"\n",
|
| 713 |
+
"# ツリー構造の具体例\n",
|
| 714 |
+
"examples = [\n",
|
| 715 |
+
" # 単一階層\n",
|
| 716 |
+
" lx.data.ExampleData(\n",
|
| 717 |
+
" text=\"第1章:概要\",\n",
|
| 718 |
+
" extractions=[\n",
|
| 719 |
+
" lx.data.Extraction(\n",
|
| 720 |
+
" extraction_class=\"heading\",\n",
|
| 721 |
+
" extraction_text=\"第1章:概要\",\n",
|
| 722 |
+
" attributes={\"level\": 1, \"children\": []}\n",
|
| 723 |
+
" )\n",
|
| 724 |
+
" ]\n",
|
| 725 |
+
" ),\n",
|
| 726 |
+
" # 2階層あり\n",
|
| 727 |
+
" lx.data.ExampleData(\n",
|
| 728 |
+
" text=\"第1章:概要\\n1.1 背景\\n1.2 目的\",\n",
|
| 729 |
+
" extractions=[\n",
|
| 730 |
+
" lx.data.Extraction(\"heading\",\n",
|
| 731 |
+
" \"第1章:概要\",\n",
|
| 732 |
+
" attributes={\"level\": 1, \"children\": [\"1.1 背景\", \"1.2 目的\"]}),\n",
|
| 733 |
+
" lx.data.Extraction(\"heading\", \"1.1 背景\", attributes={\"level\": 2, \"children\": []}),\n",
|
| 734 |
+
" lx.data.Extraction(\"heading\", \"1.2 目的\", attributes={\"level\": 2, \"children\": []}),\n",
|
| 735 |
+
" ]\n",
|
| 736 |
+
" ),\n",
|
| 737 |
+
" # 3階層と子無しケース\n",
|
| 738 |
+
" lx.data.ExampleData(\n",
|
| 739 |
+
" text=\"第2章:分析\\n2.1 データ\\n2.1.1 収集\\n2.1.2 前処理\\n2.2 結果\",\n",
|
| 740 |
+
" extractions=[\n",
|
| 741 |
+
" lx.data.Extraction(\"heading\", \"第2章:分析\", attributes={\"level\": 1, \"children\": [\"2.1 データ\", \"2.2 結果\"]}),\n",
|
| 742 |
+
" lx.data.Extraction(\"heading\", \"2.1 データ\", attributes={\"level\": 2, \"children\": [\"2.1.1 収集\", \"2.1.2 前処理\"]}),\n",
|
| 743 |
+
" lx.data.Extraction(\"heading\", \"2.1.1 収集\", attributes={\"level\": 3, \"children\": []}),\n",
|
| 744 |
+
" lx.data.Extraction(\"heading\", \"2.1.2 前処理\", attributes={\"level\": 3, \"children\": []}),\n",
|
| 745 |
+
" lx.data.Extraction(\"heading\", \"2.2 結果\", attributes={\"level\": 2, \"children\": []}),\n",
|
| 746 |
+
" ]\n",
|
| 747 |
+
" )\n",
|
| 748 |
+
"]\n",
|
| 749 |
+
"\n",
|
| 750 |
+
"result = lx.extract(\n",
|
| 751 |
+
" text_or_documents=\"第2章:分析\\n2.1 データ\\n2.1.1 収集\\n2.1.2 前処理\\n2.2 結果\",\n",
|
| 752 |
+
" prompt_description=prompt,\n",
|
| 753 |
+
" examples=examples,\n",
|
| 754 |
+
" language_model_type=inference.OllamaLanguageModel,\n",
|
| 755 |
+
" model_id=\"gemma2:latest\",\n",
|
| 756 |
+
" model_url=\"http://localhost:11434\"\n",
|
| 757 |
+
")\n",
|
| 758 |
+
"\n",
|
| 759 |
+
"print(result)\n"
|
| 760 |
+
]
|
| 761 |
+
},
|
| 762 |
+
{
|
| 763 |
+
"cell_type": "code",
|
| 764 |
+
"execution_count": 45,
|
| 765 |
+
"id": "d355c87a",
|
| 766 |
+
"metadata": {},
|
| 767 |
+
"outputs": [
|
| 768 |
+
{
|
| 769 |
+
"data": {
|
| 770 |
+
"text/plain": [
|
| 771 |
+
"[Extraction(extraction_class='heading', extraction_text='第2章:分析', char_interval=CharInterval(start_pos=0, end_pos=6), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={'level': 1, 'children': ['2.1 データ', '2.2 結果']}),\n",
|
| 772 |
+
" Extraction(extraction_class='heading', extraction_text='2.1 データ', char_interval=CharInterval(start_pos=7, end_pos=14), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={'level': 2, 'children': ['2.1.1 収集', '2.1.2 前処理']}),\n",
|
| 773 |
+
" Extraction(extraction_class='heading', extraction_text='2.1.1 収集', char_interval=CharInterval(start_pos=15, end_pos=23), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=3, group_index=2, description=None, attributes={'level': 3, 'children': []}),\n",
|
| 774 |
+
" Extraction(extraction_class='heading', extraction_text='2.1.2 前処理', char_interval=CharInterval(start_pos=24, end_pos=33), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=4, group_index=3, description=None, attributes={'level': 3, 'children': []}),\n",
|
| 775 |
+
" Extraction(extraction_class='heading', extraction_text='2.2 結果', char_interval=CharInterval(start_pos=34, end_pos=40), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=5, group_index=4, description=None, attributes={'level': 2, 'children': []})]"
|
| 776 |
+
]
|
| 777 |
+
},
|
| 778 |
+
"execution_count": 45,
|
| 779 |
+
"metadata": {},
|
| 780 |
+
"output_type": "execute_result"
|
| 781 |
+
}
|
| 782 |
+
],
|
| 783 |
+
"source": [
|
| 784 |
+
"result.extractions"
|
| 785 |
+
]
|
| 786 |
+
}
|
| 787 |
+
],
|
| 788 |
+
"metadata": {
|
| 789 |
+
"kernelspec": {
|
| 790 |
+
"display_name": "Python 3",
|
| 791 |
+
"language": "python",
|
| 792 |
+
"name": "python3"
|
| 793 |
+
},
|
| 794 |
+
"language_info": {
|
| 795 |
+
"codemirror_mode": {
|
| 796 |
+
"name": "ipython",
|
| 797 |
+
"version": 3
|
| 798 |
+
},
|
| 799 |
+
"file_extension": ".py",
|
| 800 |
+
"mimetype": "text/x-python",
|
| 801 |
+
"name": "python",
|
| 802 |
+
"nbconvert_exporter": "python",
|
| 803 |
+
"pygments_lexer": "ipython3",
|
| 804 |
+
"version": "3.12.9"
|
| 805 |
+
}
|
| 806 |
+
},
|
| 807 |
+
"nbformat": 4,
|
| 808 |
+
"nbformat_minor": 5
|
| 809 |
+
}
|
20250803_langextract/visualization.html
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<style>
|
| 2 |
+
.lx-highlight { position: relative; border-radius:3px; padding:1px 2px;}
|
| 3 |
+
.lx-highlight .lx-tooltip {
|
| 4 |
+
visibility: hidden;
|
| 5 |
+
opacity: 0;
|
| 6 |
+
transition: opacity 0.2s ease-in-out;
|
| 7 |
+
background: #333;
|
| 8 |
+
color: #fff;
|
| 9 |
+
text-align: left;
|
| 10 |
+
border-radius: 4px;
|
| 11 |
+
padding: 6px 8px;
|
| 12 |
+
position: absolute;
|
| 13 |
+
z-index: 1000;
|
| 14 |
+
bottom: 125%;
|
| 15 |
+
left: 50%;
|
| 16 |
+
transform: translateX(-50%);
|
| 17 |
+
font-size: 12px;
|
| 18 |
+
max-width: 240px;
|
| 19 |
+
white-space: normal;
|
| 20 |
+
box-shadow: 0 2px 6px rgba(0,0,0,0.3);
|
| 21 |
+
}
|
| 22 |
+
.lx-highlight:hover .lx-tooltip { visibility: visible; opacity:1; }
|
| 23 |
+
.lx-animated-wrapper { max-width: 100%; font-family: Arial, sans-serif; }
|
| 24 |
+
.lx-controls {
|
| 25 |
+
background: #fafafa; border: 1px solid #90caf9; border-radius: 8px;
|
| 26 |
+
padding: 12px; margin-bottom: 16px;
|
| 27 |
+
}
|
| 28 |
+
.lx-button-row {
|
| 29 |
+
display: flex; justify-content: center; gap: 8px; margin-bottom: 12px;
|
| 30 |
+
}
|
| 31 |
+
.lx-control-btn {
|
| 32 |
+
background: #4285f4; color: white; border: none; border-radius: 4px;
|
| 33 |
+
padding: 8px 16px; cursor: pointer; font-size: 13px; font-weight: 500;
|
| 34 |
+
transition: background-color 0.2s;
|
| 35 |
+
}
|
| 36 |
+
.lx-control-btn:hover { background: #3367d6; }
|
| 37 |
+
.lx-progress-container {
|
| 38 |
+
margin-bottom: 8px;
|
| 39 |
+
}
|
| 40 |
+
.lx-progress-slider {
|
| 41 |
+
width: 100%; margin: 0; appearance: none; height: 6px;
|
| 42 |
+
background: #ddd; border-radius: 3px; outline: none;
|
| 43 |
+
}
|
| 44 |
+
.lx-progress-slider::-webkit-slider-thumb {
|
| 45 |
+
appearance: none; width: 18px; height: 18px; background: #4285f4;
|
| 46 |
+
border-radius: 50%; cursor: pointer;
|
| 47 |
+
}
|
| 48 |
+
.lx-progress-slider::-moz-range-thumb {
|
| 49 |
+
width: 18px; height: 18px; background: #4285f4; border-radius: 50%;
|
| 50 |
+
cursor: pointer; border: none;
|
| 51 |
+
}
|
| 52 |
+
.lx-status-text {
|
| 53 |
+
text-align: center; font-size: 12px; color: #666; margin-top: 4px;
|
| 54 |
+
}
|
| 55 |
+
.lx-text-window {
|
| 56 |
+
font-family: monospace; white-space: pre-wrap; border: 1px solid #90caf9;
|
| 57 |
+
padding: 12px; max-height: 260px; overflow-y: auto; margin-bottom: 12px;
|
| 58 |
+
line-height: 1.6;
|
| 59 |
+
}
|
| 60 |
+
.lx-attributes-panel {
|
| 61 |
+
background: #fafafa; border: 1px solid #90caf9; border-radius: 6px;
|
| 62 |
+
padding: 8px 10px; margin-top: 8px; font-size: 13px;
|
| 63 |
+
}
|
| 64 |
+
.lx-current-highlight {
|
| 65 |
+
text-decoration: underline;
|
| 66 |
+
text-decoration-color: #ff4444;
|
| 67 |
+
text-decoration-thickness: 3px;
|
| 68 |
+
font-weight: bold;
|
| 69 |
+
animation: lx-pulse 1s ease-in-out;
|
| 70 |
+
}
|
| 71 |
+
@keyframes lx-pulse {
|
| 72 |
+
0% { text-decoration-color: #ff4444; }
|
| 73 |
+
50% { text-decoration-color: #ff0000; }
|
| 74 |
+
100% { text-decoration-color: #ff4444; }
|
| 75 |
+
}
|
| 76 |
+
.lx-legend {
|
| 77 |
+
font-size: 12px; margin-bottom: 8px;
|
| 78 |
+
padding-bottom: 8px; border-bottom: 1px solid #e0e0e0;
|
| 79 |
+
}
|
| 80 |
+
.lx-label {
|
| 81 |
+
display: inline-block;
|
| 82 |
+
padding: 2px 4px;
|
| 83 |
+
border-radius: 3px;
|
| 84 |
+
margin-right: 4px;
|
| 85 |
+
color: #000;
|
| 86 |
+
}
|
| 87 |
+
.lx-attr-key {
|
| 88 |
+
font-weight: 600;
|
| 89 |
+
color: #1565c0;
|
| 90 |
+
letter-spacing: 0.3px;
|
| 91 |
+
}
|
| 92 |
+
.lx-attr-value {
|
| 93 |
+
font-weight: 400;
|
| 94 |
+
opacity: 0.85;
|
| 95 |
+
letter-spacing: 0.2px;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
/* Add optimizations with larger fonts and better readability for GIFs */
|
| 99 |
+
.lx-gif-optimized .lx-text-window { font-size: 16px; line-height: 1.8; }
|
| 100 |
+
.lx-gif-optimized .lx-attributes-panel { font-size: 15px; }
|
| 101 |
+
.lx-gif-optimized .lx-current-highlight { text-decoration-thickness: 4px; }
|
| 102 |
+
</style>
|
| 103 |
+
<div class="lx-animated-wrapper lx-gif-optimized">
|
| 104 |
+
<div class="lx-attributes-panel">
|
| 105 |
+
<div class="lx-legend">Highlights Legend: <span class="lx-label" style="background-color:#D2E3FC;">character</span> <span class="lx-label" style="background-color:#C8E6C9;">emotion</span> <span class="lx-label" style="background-color:#FEF0C3;">relationship</span></div>
|
| 106 |
+
<div id="attributesContainer"></div>
|
| 107 |
+
</div>
|
| 108 |
+
<div class="lx-text-window" id="textWindow">
|
| 109 |
+
<span class="lx-highlight lx-current-highlight" data-idx="0" style="background-color:#FEF0C3;"><span class="lx-highlight" data-idx="1" style="background-color:#D2E3FC;">Lady Juliet</span> gazed longingly at the stars, her heart <span class="lx-highlight" data-idx="2" style="background-color:#C8E6C9;">aching</span> for Romeo</span>
|
| 110 |
+
</div>
|
| 111 |
+
<div class="lx-controls">
|
| 112 |
+
<div class="lx-button-row">
|
| 113 |
+
<button class="lx-control-btn" onclick="playPause()">▶️ Play</button>
|
| 114 |
+
<button class="lx-control-btn" onclick="prevExtraction()">⏮ Previous</button>
|
| 115 |
+
<button class="lx-control-btn" onclick="nextExtraction()">⏭ Next</button>
|
| 116 |
+
</div>
|
| 117 |
+
<div class="lx-progress-container">
|
| 118 |
+
<input type="range" id="progressSlider" class="lx-progress-slider"
|
| 119 |
+
min="0" max="2" value="0"
|
| 120 |
+
onchange="jumpToExtraction(this.value)">
|
| 121 |
+
</div>
|
| 122 |
+
<div class="lx-status-text">
|
| 123 |
+
Entity <span id="entityInfo">1/3</span> |
|
| 124 |
+
Pos <span id="posInfo">[0-11]</span>
|
| 125 |
+
</div>
|
| 126 |
+
</div>
|
| 127 |
+
</div>
|
| 128 |
+
|
| 129 |
+
<script>
|
| 130 |
+
(function() {
|
| 131 |
+
const extractions = [{"index": 0, "class": "relationship", "text": "Lady Juliet and Romeo", "color": "#FEF0C3", "startPos": 0, "endPos": 68, "beforeText": "", "extractionText": "Lady Juliet gazed longingly at the stars, her heart aching for Romeo", "afterText": "", "attributesHtml": "<div><strong>class:</strong> relationship</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">type</span>: <span class=\"lx-attr-value\">romantic love</span>}</div>"}, {"index": 1, "class": "character", "text": "Lady Juliet", "color": "#D2E3FC", "startPos": 0, "endPos": 11, "beforeText": "", "extractionText": "Lady Juliet", "afterText": " gazed longingly at the stars, her heart aching for Romeo", "attributesHtml": "<div><strong>class:</strong> character</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">emotional_state</span>: <span class=\"lx-attr-value\">longing</span>}</div>"}, {"index": 2, "class": "emotion", "text": "aching", "color": "#C8E6C9", "startPos": 52, "endPos": 58, "beforeText": "Lady Juliet gazed longingly at the stars, her heart ", "extractionText": "aching", "afterText": " for Romeo", "attributesHtml": "<div><strong>class:</strong> emotion</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">feeling</span>: <span class=\"lx-attr-value\">sorrowful desire</span>}</div>"}];
|
| 132 |
+
// Zero-based position in `extractions` of the entry currently displayed.
let currentIndex = 0;
|
| 133 |
+
// True while the auto-advance timer started by playPause() is running.
let isPlaying = false;
|
| 134 |
+
// Handle returned by setInterval while playing; passed to clearInterval on pause.
let animationInterval = null;
|
| 135 |
+
// Seconds between automatic steps; playPause() multiplies this by 1000 to get milliseconds.
let animationSpeed = 1.0;
|
| 136 |
+
|
| 137 |
+
/**
 * Re-render the attributes panel, status text, slider, play button label and
 * text highlight so they all reflect the extraction at `currentIndex`.
 *
 * Does nothing when `currentIndex` does not point at an extraction.
 */
function updateDisplay() {
  const active = extractions[currentIndex];
  if (!active) return;

  const { attributesHtml, startPos, endPos } = active;
  const byId = (id) => document.getElementById(id);

  // attributesHtml is pre-rendered markup, so it is assigned as HTML, not text.
  byId('attributesContainer').innerHTML = attributesHtml;
  byId('entityInfo').textContent = `${currentIndex + 1}/${extractions.length}`;
  byId('posInfo').textContent = `[${startPos}-${endPos}]`;
  byId('progressSlider').value = currentIndex;

  // querySelector returns the first control button in the document, which is
  // the Play/Pause button in the accompanying markup.
  const playButton = document.querySelector('.lx-control-btn');
  if (playButton) {
    playButton.textContent = isPlaying ? '⏸ Pause' : '▶️ Play';
  }

  // Move the "current" marker from the previously highlighted span to the new one.
  const staleSpan = document.querySelector('.lx-text-window .lx-current-highlight');
  if (staleSpan) {
    staleSpan.classList.remove('lx-current-highlight');
  }

  const activeSpan = document.querySelector(`.lx-text-window span[data-idx="${currentIndex}"]`);
  if (!activeSpan) return;

  activeSpan.classList.add('lx-current-highlight');
  activeSpan.scrollIntoView({block: 'center', behavior: 'smooth'});
}
|
| 157 |
+
|
| 158 |
+
/**
 * Step forward to the following extraction, wrapping from the last entry back
 * to the first, then refresh the display.
 */
function nextExtraction() {
  const total = extractions.length;
  const following = currentIndex + 1;
  currentIndex = following % total;
  updateDisplay();
}
|
| 162 |
+
|
| 163 |
+
/**
 * Step back to the preceding extraction, wrapping from the first entry to the
 * last, then refresh the display.
 */
function prevExtraction() {
  const total = extractions.length;
  // Adding `total` before the modulo keeps the result non-negative when
  // stepping back from index 0.
  currentIndex = (currentIndex + total - 1) % total;
  updateDisplay();
}
|
| 167 |
+
|
| 168 |
+
/**
 * Jump directly to a given extraction, e.g. from the progress slider.
 *
 * The value is parsed as a base-10 integer. Input that does not parse to a
 * number is ignored, and out-of-range positions are clamped to the first or
 * last extraction, so `currentIndex` always refers to a real entry.
 *
 * @param {string|number} index - Zero-based position. The slider's `onchange`
 *   handler passes its `value`, which arrives as a string.
 */
function jumpToExtraction(index) {
  const requested = Number.parseInt(index, 10);
  if (Number.isNaN(requested) || extractions.length === 0) return;

  const lastIndex = extractions.length - 1;
  currentIndex = Math.min(Math.max(requested, 0), lastIndex);
  updateDisplay();
}
|
| 172 |
+
|
| 173 |
+
/**
 * Toggle automatic playback: start a timer that advances to the next
 * extraction every `animationSpeed` seconds, or stop the running timer.
 * The display is refreshed afterwards so the button label matches the state.
 */
function playPause() {
  if (!isPlaying) {
    const delayMs = animationSpeed * 1000;
    animationInterval = setInterval(nextExtraction, delayMs);
  } else {
    clearInterval(animationInterval);
  }
  isPlaying = !isPlaying;
  updateDisplay();
}
|
| 183 |
+
|
| 184 |
+
// The handlers are declared inside this IIFE, but the inline onclick/onchange
// attributes in the markup call them as globals, so publish them on window.
window.playPause = playPause;
|
| 185 |
+
window.nextExtraction = nextExtraction;
|
| 186 |
+
window.prevExtraction = prevExtraction;
|
| 187 |
+
window.jumpToExtraction = jumpToExtraction;
|
| 188 |
+
|
| 189 |
+
// Initial render for the starting value of currentIndex.
updateDisplay();
|
| 190 |
+
})();
|
| 191 |
+
</script>
|