UserSyncUI

Paused

App Files Files Community

UserSyncUI / docs /api /tinytroupe /extraction /normalizer.html

harvesthealth

Upload folder using huggingface_hub

f6686e1 verified 2 months ago

raw

history blame contribute delete

23.4 kB

	<!doctype html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
	<meta name="generator" content="pdoc 0.10.0" />
	<title>tinytroupe.extraction.normalizer API documentation</title>
	<meta name="description" content="" />
	<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
	<link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
	<link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
	<style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > :last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
	<style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
	<style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
	<script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
	<script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
	</head>
	<body>
	<main>
	<article id="content">
	<header>
	<h1 class="title">Module <code>tinytroupe.extraction.normalizer</code></h1>
	</header>
	<section id="section-intro">
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">import pandas as pd
	from typing import Union, List

	from tinytroupe.extraction import logger

	from tinytroupe import openai_utils
	import tinytroupe.utils as utils
	class Normalizer:
	    """
	    A mechanism to normalize passages, concepts and other textual elements.

	    On construction the given elements are sent to the LLM together with the
	    requested number of normalized outputs, and the parsed reply is kept in
	    ``normalized_elements``. ``normalize`` later maps arbitrary elements onto
	    that result and caches every answer in ``normalizing_map``.
	    """

	    def __init__(self, elements:List[str], n:int, verbose:bool=False):
	        """
	        Normalizes the specified elements.

	        Args:
	            elements (list): The elements to normalize. Duplicates are dropped, keeping the
	                first occurrence of each element.
	            n (int): The number of normalized elements to output.
	            verbose (bool, optional): Whether to print debug messages. Defaults to False.
	        """
	        # Ensure elements are unique. dict.fromkeys() keeps the first-seen order, so the
	        # prompt built below is the same on every run; a set of strings would be iterated
	        # in an order that changes with hash randomization.
	        self.elements = list(dict.fromkeys(elements))

	        self.n = n
	        self.verbose = verbose

	        # a JSON-based structure, where each output element is a key to a list of input elements that were merged into it
	        self.normalized_elements = None
	        # a dict that maps each input element to its normalized output. This will be used as cache later.
	        self.normalizing_map = {}

	        rendering_configs = {"n": n,
	                             "elements": self.elements}

	        messages = utils.compose_initial_LLM_messages_with_templates("normalizer.system.mustache", "normalizer.user.mustache",
	                                                                     base_module_folder="extraction",
	                                                                     rendering_configs=rendering_configs)

	        next_message = openai_utils.client().send_message(messages, temperature=0.1)

	        debug_msg = f"Normalization result message: {next_message}"
	        logger.debug(debug_msg)
	        if self.verbose:
	            print(debug_msg)

	        # The reply content is parsed as JSON and becomes the normalization result.
	        result = utils.extract_json(next_message["content"])
	        logger.debug(result)
	        if self.verbose:
	            print(result)

	        self.normalized_elements = result


	    def normalize(self, element_or_elements:Union[str, List[str]]) -> Union[str, List[str]]:
	        """
	        Normalizes the specified element or elements.

	        This method uses a caching mechanism to improve performance. If an element has been normalized before,
	        its normalized form is stored in a cache (self.normalizing_map). When the same element needs to be
	        normalized again, the method will first check the cache and use the stored normalized form if available,
	        instead of normalizing the element again. An element that occurs several times in one call is sent
	        to the LLM only once.

	        The order of elements in the output will be the same as in the input, because the output is built
	        by looking up every input element in the cache, in input order.

	        Args:
	            element_or_elements (Union[str, List[str]]): The element or elements to normalize.

	        Returns:
	            list: The normalized elements, in the same order as the input. A string input is handled as a
	                one-element list, so the result for it is a one-element list too.

	        Raises:
	            ValueError: If the input is neither a string nor a list.
	            AssertionError: If the LLM reply is not a list, or does not contain exactly one entry per
	                element that had to be normalized.
	        """
	        if isinstance(element_or_elements, str):
	            denormalized_elements = [element_or_elements]
	        elif isinstance(element_or_elements, list):
	            denormalized_elements = element_or_elements
	        else:
	            raise ValueError("The element_or_elements must be either a string or a list.")

	        # Elements that are not cached yet, de-duplicated in first-seen order so that a repeated
	        # element is not sent to the LLM several times within the same request.
	        elements_to_normalize = list(dict.fromkeys(
	            element for element in denormalized_elements
	            if element not in self.normalizing_map
	        ))

	        if elements_to_normalize:
	            rendering_configs = {"categories": self.normalized_elements,
	                                 "elements": elements_to_normalize}

	            messages = utils.compose_initial_LLM_messages_with_templates("normalizer.applier.system.mustache", "normalizer.applier.user.mustache",
	                                                                         base_module_folder="extraction",
	                                                                         rendering_configs=rendering_configs)

	            next_message = openai_utils.client().send_message(messages, temperature=0.1)

	            debug_msg = f"Normalization result message: {next_message}"
	            logger.debug(debug_msg)
	            if self.verbose:
	                print(debug_msg)

	            normalized_elements_from_llm = utils.extract_json(next_message["content"])

	            # Raised explicitly rather than through assert statements, which are removed when
	            # Python runs with -O. The exception type and messages are unchanged, so existing
	            # handlers keep working.
	            if not isinstance(normalized_elements_from_llm, list):
	                raise AssertionError("The normalized element must be a list.")
	            if len(normalized_elements_from_llm) != len(elements_to_normalize):
	                raise AssertionError("The number of normalized elements must be equal to the number of elements to normalize.")

	            # Replies are matched to the requested elements by position.
	            self.normalizing_map.update(zip(elements_to_normalize, normalized_elements_from_llm))

	        return [self.normalizing_map[element] for element in denormalized_elements]
	</code></pre>
	</details>
	</section>
	<section>
	</section>
	<section>
	</section>
	<section>
	</section>
	<section>
	<h2 class="section-title" id="header-classes">Classes</h2>
	<dl>
	<dt id="tinytroupe.extraction.normalizer.Normalizer"><code class="flex name class">
	<span>class <span class="ident">Normalizer</span></span>
	<span>(</span><span>elements: List[str], n: int, verbose: bool = False)</span>
	</code></dt>
	<dd>
	<div class="desc"><p>A mechanism to normalize passages, concepts and other textual elements.</p>
	<p>Normalizes the specified elements.</p>
	<h2 id="args">Args</h2>
	<dl>
	<dt><strong><code>elements</code></strong> :&ensp;<code>list</code></dt>
	<dd>The elements to normalize.</dd>
	<dt><strong><code>n</code></strong> :&ensp;<code>int</code></dt>
	<dd>The number of normalized elements to output.</dd>
	<dt><strong><code>verbose</code></strong> :&ensp;<code>bool</code>, optional</dt>
	<dd>Whether to print debug messages. Defaults to False.</dd>
	</dl></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">class Normalizer:
	"""
	A mechanism to normalize passages, concepts and other textual elements.

	The constructor asks the LLM for the normalized form of the given elements and stores the parsed
	reply; normalize() then maps elements onto that result, caching every answer.
	"""

	def __init__(self, elements:List[str], n:int, verbose:bool=False):
	"""
	Normalizes the specified elements.

	Args:
	elements (list): The elements to normalize. Duplicates are removed before they are sent to the LLM.
	n (int): The number of normalized elements to output.
	verbose (bool, optional): Whether to print debug messages. Defaults to False.
	"""
	# ensure elements are unique (note: set() does not keep the input order)
	self.elements = list(set(elements))

	self.n = n
	self.verbose = verbose

	# a JSON-based structure, where each output element is a key to a list of input elements that were merged into it
	self.normalized_elements = None
	# a dict that maps each input element to its normalized output. This will be used as cache later.
	self.normalizing_map = {}

	# values made available to the prompt templates
	rendering_configs = {"n": n,
	"elements": self.elements}

	messages = utils.compose_initial_LLM_messages_with_templates("normalizer.system.mustache", "normalizer.user.mustache",
	base_module_folder="extraction",
	rendering_configs=rendering_configs)

	next_message = openai_utils.client().send_message(messages, temperature=0.1)

	debug_msg = f"Normalization result message: {next_message}"
	logger.debug(debug_msg)
	if self.verbose:
	print(debug_msg)

	# the reply content is parsed as JSON and stored as the normalization result
	result = utils.extract_json(next_message["content"])
	logger.debug(result)
	if self.verbose:
	print(result)

	self.normalized_elements = result


	def normalize(self, element_or_elements:Union[str, List[str]]) -> Union[str, List[str]]:
	"""
	Normalizes the specified element or elements.

	This method uses a caching mechanism to improve performance. If an element has been normalized before,
	its normalized form is stored in a cache (self.normalizing_map). When the same element needs to be
	normalized again, the method will first check the cache and use the stored normalized form if available,
	instead of normalizing the element again.

	The order of elements in the output will be the same as in the input. This is ensured by processing
	the elements in the order they appear in the input and appending the normalized elements to the output
	list in the same order.

	Args:
	element_or_elements (Union[str, List[str]]): The element or elements to normalize.

	Returns:
	list: The normalized elements, in the same order as the input. A string input is handled as a
	one-element list, so the result for it is a one-element list too.

	Raises:
	ValueError: If the input is neither a string nor a list.
	AssertionError: If the LLM reply is not a list, or its length differs from the number of elements sent.
	"""
	# wrap a single string so that both kinds of input are processed as a list
	if isinstance(element_or_elements, str):
	denormalized_elements = [element_or_elements]
	elif isinstance(element_or_elements, list):
	denormalized_elements = element_or_elements
	else:
	raise ValueError("The element_or_elements must be either a string or a list.")

	normalized_elements = []
	# collect the elements that are not in the cache yet
	elements_to_normalize = []
	for element in denormalized_elements:
	if element not in self.normalizing_map:
	elements_to_normalize.append(element)

	# the LLM is only called when at least one element is missing from the cache
	if elements_to_normalize:
	rendering_configs = {"categories": self.normalized_elements,
	"elements": elements_to_normalize}

	messages = utils.compose_initial_LLM_messages_with_templates("normalizer.applier.system.mustache", "normalizer.applier.user.mustache",
	base_module_folder="extraction",
	rendering_configs=rendering_configs)

	next_message = openai_utils.client().send_message(messages, temperature=0.1)

	debug_msg = f"Normalization result message: {next_message}"
	logger.debug(debug_msg)
	if self.verbose:
	print(debug_msg)

	normalized_elements_from_llm = utils.extract_json(next_message["content"])
	# NOTE(review): assert statements are removed when Python runs with -O, so these checks are skipped then
	assert isinstance(normalized_elements_from_llm, list), "The normalized element must be a list."
	assert len(normalized_elements_from_llm) == len(elements_to_normalize), "The number of normalized elements must be equal to the number of elements to normalize."

	# replies are matched to the requested elements by position and stored in the cache
	for i, element in enumerate(elements_to_normalize):
	normalized_element = normalized_elements_from_llm[i]
	self.normalizing_map[element] = normalized_element

	# build the output from the cache, following the input order
	for element in denormalized_elements:
	normalized_elements.append(self.normalizing_map[element])

	return normalized_elements</code></pre>
	</details>
	<h3>Methods</h3>
	<dl>
	<dt id="tinytroupe.extraction.normalizer.Normalizer.normalize"><code class="name flex">
	<span>def <span class="ident">normalize</span></span>(<span>self, element_or_elements: Union[str, List[str]]) ‑> Union[str, List[str]]</span>
	</code></dt>
	<dd>
	<div class="desc"><p>Normalizes the specified element or elements.</p>
	<p>This method uses a caching mechanism to improve performance. If an element has been normalized before,
	its normalized form is stored in a cache (self.normalizing_map). When the same element needs to be
	normalized again, the method will first check the cache and use the stored normalized form if available,
	instead of normalizing the element again.</p>
	<p>The order of elements in the output will be the same as in the input. This is ensured by processing
	the elements in the order they appear in the input and appending the normalized elements to the output
	list in the same order.</p>
	<h2 id="args">Args</h2>
	<dl>
	<dt><strong><code>element_or_elements</code></strong> :&ensp;<code>Union[str, List[str]]</code></dt>
	<dd>The element or elements to normalize.</dd>
	</dl>
	<h2 id="returns">Returns</h2>
	<dl>
	<dt><code>str</code></dt>
	<dd>The normalized element if the input was a string. Note that the implementation currently returns a one-element list in this case.</dd>
	<dt><code>list</code></dt>
	<dd>The normalized elements if the input was a list, preserving the order of elements in the input.</dd>
	</dl></div>
	<details class="source">
	<summary>
	<span>Expand source code</span>
	</summary>
	<pre><code class="python">def normalize(self, element_or_elements:Union[str, List[str]]) -> Union[str, List[str]]:
	"""
	Normalizes the specified element or elements.

	This method uses a caching mechanism to improve performance. If an element has been normalized before,
	its normalized form is stored in a cache (self.normalizing_map). When the same element needs to be
	normalized again, the method will first check the cache and use the stored normalized form if available,
	instead of normalizing the element again.

	The order of elements in the output will be the same as in the input. This is ensured by processing
	the elements in the order they appear in the input and appending the normalized elements to the output
	list in the same order.

	Args:
	element_or_elements (Union[str, List[str]]): The element or elements to normalize.

	Returns:
	list: The normalized elements, in the same order as the input. A string input is handled as a
	one-element list, so the result for it is a one-element list too.

	Raises:
	ValueError: If the input is neither a string nor a list.
	AssertionError: If the LLM reply is not a list, or its length differs from the number of elements sent.
	"""
	# wrap a single string so that both kinds of input are processed as a list
	if isinstance(element_or_elements, str):
	denormalized_elements = [element_or_elements]
	elif isinstance(element_or_elements, list):
	denormalized_elements = element_or_elements
	else:
	raise ValueError("The element_or_elements must be either a string or a list.")

	normalized_elements = []
	# collect the elements that are not in the cache yet
	elements_to_normalize = []
	for element in denormalized_elements:
	if element not in self.normalizing_map:
	elements_to_normalize.append(element)

	# the LLM is only called when at least one element is missing from the cache
	if elements_to_normalize:
	rendering_configs = {"categories": self.normalized_elements,
	"elements": elements_to_normalize}

	messages = utils.compose_initial_LLM_messages_with_templates("normalizer.applier.system.mustache", "normalizer.applier.user.mustache",
	base_module_folder="extraction",
	rendering_configs=rendering_configs)

	next_message = openai_utils.client().send_message(messages, temperature=0.1)

	debug_msg = f"Normalization result message: {next_message}"
	logger.debug(debug_msg)
	if self.verbose:
	print(debug_msg)

	normalized_elements_from_llm = utils.extract_json(next_message["content"])
	# NOTE(review): assert statements are removed when Python runs with -O, so these checks are skipped then
	assert isinstance(normalized_elements_from_llm, list), "The normalized element must be a list."
	assert len(normalized_elements_from_llm) == len(elements_to_normalize), "The number of normalized elements must be equal to the number of elements to normalize."

	# replies are matched to the requested elements by position and stored in the cache
	for i, element in enumerate(elements_to_normalize):
	normalized_element = normalized_elements_from_llm[i]
	self.normalizing_map[element] = normalized_element

	# build the output from the cache, following the input order
	for element in denormalized_elements:
	normalized_elements.append(self.normalizing_map[element])

	return normalized_elements</code></pre>
	</details>
	</dd>
	</dl>
	</dd>
	</dl>
	</section>
	</article>
	<nav id="sidebar">
	<h1>Index</h1>
	<div class="toc">
	<ul></ul>
	</div>
	<ul id="index">
	<li><h3>Super-module</h3>
	<ul>
	<li><code><a title="tinytroupe.extraction" href="index.html">tinytroupe.extraction</a></code></li>
	</ul>
	</li>
	<li><h3><a href="#header-classes">Classes</a></h3>
	<ul>
	<li>
	<h4><code><a title="tinytroupe.extraction.normalizer.Normalizer" href="#tinytroupe.extraction.normalizer.Normalizer">Normalizer</a></code></h4>
	<ul class="">
	<li><code><a title="tinytroupe.extraction.normalizer.Normalizer.normalize" href="#tinytroupe.extraction.normalizer.Normalizer.normalize">normalize</a></code></li>
	</ul>
	</li>
	</ul>
	</li>
	</ul>
	</nav>
	</main>
	<footer id="footer">
	<p>Generated by <a href="https://pdoc3.github.io/pdoc" title="pdoc: Python API documentation generator"><cite>pdoc</cite> 0.10.0</a>.</p>
	</footer>
	</body>
	</html>