|
|
--- |
|
|
language: |
|
|
- en |
|
|
license: apache-2.0 |
|
|
tags: |
|
|
- sentence-transformers |
|
|
- sentence-similarity |
|
|
- feature-extraction |
|
|
- dense |
|
|
- generated_from_trainer |
|
|
- dataset_size:180 |
|
|
- loss:MatryoshkaLoss |
|
|
- loss:MultipleNegativesRankingLoss |
|
|
base_model: shubharuidas/codebert-embed-base-dense-retriever |
|
|
widget: |
|
|
- source_sentence: Explain the __init__ logic |
|
|
sentences: |
|
|
- "async def test_handler_with_async_execution() -> None:\n \"\"\"Test handler\ |
|
|
\ works correctly with async tool execution.\"\"\"\n\n @tool\n def async_add(a:\ |
|
|
\ int, b: int) -> int:\n \"\"\"Async add two numbers.\"\"\"\n return\ |
|
|
\ a + b\n\n def modifying_handler(\n request: ToolCallRequest,\n \ |
|
|
\ execute: Callable[[ToolCallRequest], ToolMessage | Command],\n ) -> ToolMessage\ |
|
|
\ | Command:\n \"\"\"Handler that modifies arguments.\"\"\"\n #\ |
|
|
\ Add 10 to both arguments using override method\n modified_call = {\n\ |
|
|
\ **request.tool_call,\n \"args\": {\n **request.tool_call[\"\ |
|
|
args\"],\n \"a\": request.tool_call[\"args\"][\"a\"] + 10,\n \ |
|
|
\ \"b\": request.tool_call[\"args\"][\"b\"] + 10,\n },\n\ |
|
|
\ }\n modified_request = request.override(tool_call=modified_call)\n\ |
|
|
\ return execute(modified_request)\n\n tool_node = ToolNode([async_add],\ |
|
|
\ wrap_tool_call=modifying_handler)\n\n result = await tool_node.ainvoke(\n\ |
|
|
\ {\n \"messages\": [\n AIMessage(\n \ |
|
|
\ \"adding\",\n tool_calls=[\n \ |
|
|
\ {\n \"name\": \"async_add\",\n \ |
|
|
\ \"args\": {\"a\": 1, \"b\": 2},\n \ |
|
|
\ \"id\": \"call_13\",\n }\n ],\n \ |
|
|
\ )\n ]\n },\n config=_create_config_with_runtime(),\n\ |
|
|
\ )\n\n tool_message = result[\"messages\"][-1]\n assert isinstance(tool_message,\ |
|
|
\ ToolMessage)\n # Original: 1 + 2 = 3, with modifications: 11 + 12 = 23\n\ |
|
|
\ assert tool_message.content == \"23\"" |
|
|
- "def __init__(self) -> None:\n self.loads: set[str] = set()\n self.stores:\ |
|
|
\ set[str] = set()" |
|
|
- "class InternalServerError(APIStatusError):\n pass" |
|
|
- source_sentence: Explain the async _load_checkpoint_tuple logic |
|
|
sentences: |
|
|
- 'def task(__func_or_none__: Callable[P, Awaitable[T]]) -> _TaskFunction[P, T]: |
|
|
...' |
|
|
- "class State(BaseModel):\n query: str\n inner: InnerObject\n \ |
|
|
\ answer: str | None = None\n docs: Annotated[list[str], sorted_add]" |
|
|
- "async def _load_checkpoint_tuple(self, value: DictRow) -> CheckpointTuple:\n\ |
|
|
\ \"\"\"\n Convert a database row into a CheckpointTuple object.\n\ |
|
|
\n Args:\n value: A row from the database containing checkpoint\ |
|
|
\ data.\n\n Returns:\n CheckpointTuple: A structured representation\ |
|
|
\ of the checkpoint,\n including its configuration, metadata, parent\ |
|
|
\ checkpoint (if any),\n and pending writes.\n \"\"\"\n \ |
|
|
\ return CheckpointTuple(\n {\n \"configurable\"\ |
|
|
: {\n \"thread_id\": value[\"thread_id\"],\n \ |
|
|
\ \"checkpoint_ns\": value[\"checkpoint_ns\"],\n \"checkpoint_id\"\ |
|
|
: value[\"checkpoint_id\"],\n }\n },\n {\n\ |
|
|
\ **value[\"checkpoint\"],\n \"channel_values\"\ |
|
|
: {\n **(value[\"checkpoint\"].get(\"channel_values\") or {}),\n\ |
|
|
\ **self._load_blobs(value[\"channel_values\"]),\n \ |
|
|
\ },\n },\n value[\"metadata\"],\n (\n\ |
|
|
\ {\n \"configurable\": {\n \ |
|
|
\ \"thread_id\": value[\"thread_id\"],\n \"checkpoint_ns\"\ |
|
|
: value[\"checkpoint_ns\"],\n \"checkpoint_id\": value[\"\ |
|
|
parent_checkpoint_id\"],\n }\n }\n \ |
|
|
\ if value[\"parent_checkpoint_id\"]\n else None\n \ |
|
|
\ ),\n await asyncio.to_thread(self._load_writes, value[\"pending_writes\"\ |
|
|
]),\n )" |
|
|
- source_sentence: Explain the flattened_runs logic |
|
|
sentences: |
|
|
- "class ChannelWrite(RunnableCallable):\n \"\"\"Implements the logic for sending\ |
|
|
\ writes to CONFIG_KEY_SEND.\n Can be used as a runnable or as a static method\ |
|
|
\ to call imperatively.\"\"\"\n\n writes: list[ChannelWriteEntry | ChannelWriteTupleEntry\ |
|
|
\ | Send]\n \"\"\"Sequence of write entries or Send objects to write.\"\"\"\ |
|
|
\n\n def __init__(\n self,\n writes: Sequence[ChannelWriteEntry\ |
|
|
\ | ChannelWriteTupleEntry | Send],\n *,\n tags: Sequence[str] |\ |
|
|
\ None = None,\n ):\n super().__init__(\n func=self._write,\n\ |
|
|
\ afunc=self._awrite,\n name=None,\n tags=tags,\n\ |
|
|
\ trace=False,\n )\n self.writes = cast(\n \ |
|
|
\ list[ChannelWriteEntry | ChannelWriteTupleEntry | Send], writes\n )\n\ |
|
|
\n def get_name(self, suffix: str | None = None, *, name: str | None = None)\ |
|
|
\ -> str:\n if not name:\n name = f\"ChannelWrite<{','.join(w.channel\ |
|
|
\ if isinstance(w, ChannelWriteEntry) else '...' if isinstance(w, ChannelWriteTupleEntry)\ |
|
|
\ else w.node for w in self.writes)}>\"\n return super().get_name(suffix,\ |
|
|
\ name=name)\n\n def _write(self, input: Any, config: RunnableConfig) -> None:\n\ |
|
|
\ writes = [\n ChannelWriteEntry(write.channel, input, write.skip_none,\ |
|
|
\ write.mapper)\n if isinstance(write, ChannelWriteEntry) and write.value\ |
|
|
\ is PASSTHROUGH\n else ChannelWriteTupleEntry(write.mapper, input)\n\ |
|
|
\ if isinstance(write, ChannelWriteTupleEntry) and write.value is PASSTHROUGH\n\ |
|
|
\ else write\n for write in self.writes\n ]\n \ |
|
|
\ self.do_write(\n config,\n writes,\n )\n \ |
|
|
\ return input\n\n async def _awrite(self, input: Any, config: RunnableConfig)\ |
|
|
\ -> None:\n writes = [\n ChannelWriteEntry(write.channel, input,\ |
|
|
\ write.skip_none, write.mapper)\n if isinstance(write, ChannelWriteEntry)\ |
|
|
\ and write.value is PASSTHROUGH\n else ChannelWriteTupleEntry(write.mapper,\ |
|
|
\ input)\n if isinstance(write, ChannelWriteTupleEntry) and write.value\ |
|
|
\ is PASSTHROUGH\n else write\n for write in self.writes\n\ |
|
|
\ ]\n self.do_write(\n config,\n writes,\n\ |
|
|
\ )\n return input\n\n @staticmethod\n def do_write(\n \ |
|
|
\ config: RunnableConfig,\n writes: Sequence[ChannelWriteEntry | ChannelWriteTupleEntry\ |
|
|
\ | Send],\n allow_passthrough: bool = True,\n ) -> None:\n #\ |
|
|
\ validate\n for w in writes:\n if isinstance(w, ChannelWriteEntry):\n\ |
|
|
\ if w.channel == TASKS:\n raise InvalidUpdateError(\n\ |
|
|
\ \"Cannot write to the reserved channel TASKS\"\n \ |
|
|
\ )\n if w.value is PASSTHROUGH and not allow_passthrough:\n\ |
|
|
\ raise InvalidUpdateError(\"PASSTHROUGH value must be replaced\"\ |
|
|
)\n if isinstance(w, ChannelWriteTupleEntry):\n if w.value\ |
|
|
\ is PASSTHROUGH and not allow_passthrough:\n raise InvalidUpdateError(\"\ |
|
|
PASSTHROUGH value must be replaced\")\n # if we want to persist writes\ |
|
|
\ found before hitting a ParentCommand\n # can move this to a finally block\n\ |
|
|
\ write: TYPE_SEND = config[CONF][CONFIG_KEY_SEND]\n write(_assemble_writes(writes))\n\ |
|
|
\n @staticmethod\n def is_writer(runnable: Runnable) -> bool:\n \"\ |
|
|
\"\"Used by PregelNode to distinguish between writers and other runnables.\"\"\ |
|
|
\"\n return (\n isinstance(runnable, ChannelWrite)\n \ |
|
|
\ or getattr(runnable, \"_is_channel_writer\", MISSING) is not MISSING\n \ |
|
|
\ )\n\n @staticmethod\n def get_static_writes(\n runnable:\ |
|
|
\ Runnable,\n ) -> Sequence[tuple[str, Any, str | None]] | None:\n \"\ |
|
|
\"\"Used to get conditional writes a writer declares for static analysis.\"\"\"\ |
|
|
\n if isinstance(runnable, ChannelWrite):\n return [\n \ |
|
|
\ w\n for entry in runnable.writes\n if\ |
|
|
\ isinstance(entry, ChannelWriteTupleEntry) and entry.static\n \ |
|
|
\ for w in entry.static\n ] or None\n elif writes := getattr(runnable,\ |
|
|
\ \"_is_channel_writer\", MISSING):\n if writes is not MISSING:\n \ |
|
|
\ writes = cast(\n Sequence[tuple[ChannelWriteEntry\ |
|
|
\ | Send, str | None]],\n writes,\n )\n \ |
|
|
\ entries = [e for e, _ in writes]\n labels = [la for\ |
|
|
\ _, la in writes]\n return [(*t, la) for t, la in zip(_assemble_writes(entries),\ |
|
|
\ labels)]\n\n @staticmethod\n def register_writer(\n runnable: R,\n\ |
|
|
\ static: Sequence[tuple[ChannelWriteEntry | Send, str | None]] | None\ |
|
|
\ = None,\n ) -> R:\n \"\"\"Used to mark a runnable as a writer, so\ |
|
|
\ that it can be detected by is_writer.\n Instances of ChannelWrite are\ |
|
|
\ automatically marked as writers.\n Optionally, a list of declared writes\ |
|
|
\ can be passed for static analysis.\"\"\"\n # using object.__setattr__\ |
|
|
\ to work around objects that override __setattr__\n # eg. pydantic models\ |
|
|
\ and dataclasses\n object.__setattr__(runnable, \"_is_channel_writer\"\ |
|
|
, static)\n return runnable" |
|
|
- "def test_double_interrupt_subgraph(sync_checkpointer: BaseCheckpointSaver) ->\ |
|
|
\ None:\n class AgentState(TypedDict):\n input: str\n\n def node_1(state:\ |
|
|
\ AgentState):\n result = interrupt(\"interrupt node 1\")\n return\ |
|
|
\ {\"input\": result}\n\n def node_2(state: AgentState):\n result =\ |
|
|
\ interrupt(\"interrupt node 2\")\n return {\"input\": result}\n\n subgraph_builder\ |
|
|
\ = (\n StateGraph(AgentState)\n .add_node(\"node_1\", node_1)\n\ |
|
|
\ .add_node(\"node_2\", node_2)\n .add_edge(START, \"node_1\")\n\ |
|
|
\ .add_edge(\"node_1\", \"node_2\")\n .add_edge(\"node_2\", END)\n\ |
|
|
\ )\n\n # invoke the sub graph\n subgraph = subgraph_builder.compile(checkpointer=sync_checkpointer)\n\ |
|
|
\ thread = {\"configurable\": {\"thread_id\": str(uuid.uuid4())}}\n assert\ |
|
|
\ [c for c in subgraph.stream({\"input\": \"test\"}, thread)] == [\n {\n\ |
|
|
\ \"__interrupt__\": (\n Interrupt(\n \ |
|
|
\ value=\"interrupt node 1\",\n id=AnyStr(),\n \ |
|
|
\ ),\n )\n },\n ]\n # resume from the first interrupt\n\ |
|
|
\ assert [c for c in subgraph.stream(Command(resume=\"123\"), thread)] == [\n\ |
|
|
\ {\n \"node_1\": {\"input\": \"123\"},\n },\n \ |
|
|
\ {\n \"__interrupt__\": (\n Interrupt(\n \ |
|
|
\ value=\"interrupt node 2\",\n id=AnyStr(),\n \ |
|
|
\ ),\n )\n },\n ]\n # resume from the second\ |
|
|
\ interrupt\n assert [c for c in subgraph.stream(Command(resume=\"123\"), thread)]\ |
|
|
\ == [\n {\n \"node_2\": {\"input\": \"123\"},\n },\n\ |
|
|
\ ]\n\n subgraph = subgraph_builder.compile()\n\n def invoke_sub_agent(state:\ |
|
|
\ AgentState):\n return subgraph.invoke(state)\n\n thread = {\"configurable\"\ |
|
|
: {\"thread_id\": str(uuid.uuid4())}}\n parent_agent = (\n StateGraph(AgentState)\n\ |
|
|
\ .add_node(\"invoke_sub_agent\", invoke_sub_agent)\n .add_edge(START,\ |
|
|
\ \"invoke_sub_agent\")\n .add_edge(\"invoke_sub_agent\", END)\n \ |
|
|
\ .compile(checkpointer=sync_checkpointer)\n )\n\n assert [c for c in parent_agent.stream({\"\ |
|
|
input\": \"test\"}, thread)] == [\n {\n \"__interrupt__\": (\n\ |
|
|
\ Interrupt(\n value=\"interrupt node 1\",\n\ |
|
|
\ id=AnyStr(),\n ),\n )\n \ |
|
|
\ },\n ]\n\n # resume from the first interrupt\n assert [c for c in parent_agent.stream(Command(resume=True),\ |
|
|
\ thread)] == [\n {\n \"__interrupt__\": (\n \ |
|
|
\ Interrupt(\n value=\"interrupt node 2\",\n \ |
|
|
\ id=AnyStr(),\n ),\n )\n }\n ]\n\n \ |
|
|
\ # resume from 2nd interrupt\n assert [c for c in parent_agent.stream(Command(resume=True),\ |
|
|
\ thread)] == [\n {\n \"invoke_sub_agent\": {\"input\": True},\n\ |
|
|
\ },\n ]" |
|
|
- "def flattened_runs(self) -> list[Run]:\n q = [] + self.runs\n result\ |
|
|
\ = []\n while q:\n parent = q.pop()\n result.append(parent)\n\ |
|
|
\ if parent.child_runs:\n q.extend(parent.child_runs)\n\ |
|
|
\ return result" |
|
|
- source_sentence: Explain the SubGraphState logic |
|
|
sentences: |
|
|
- "class Cron(TypedDict):\n \"\"\"Represents a scheduled task.\"\"\"\n\n cron_id:\ |
|
|
\ str\n \"\"\"The ID of the cron.\"\"\"\n assistant_id: str\n \"\"\"\ |
|
|
The ID of the assistant.\"\"\"\n thread_id: str | None\n \"\"\"The ID of\ |
|
|
\ the thread.\"\"\"\n on_run_completed: OnCompletionBehavior | None\n \"\ |
|
|
\"\"What to do with the thread after the run completes. Only applicable for stateless\ |
|
|
\ crons.\"\"\"\n end_time: datetime | None\n \"\"\"The end date to stop\ |
|
|
\ running the cron.\"\"\"\n schedule: str\n \"\"\"The schedule to run, cron\ |
|
|
\ format.\"\"\"\n created_at: datetime\n \"\"\"The time the cron was created.\"\ |
|
|
\"\"\n updated_at: datetime\n \"\"\"The last time the cron was updated.\"\ |
|
|
\"\"\n payload: dict\n \"\"\"The run payload to use for creating new run.\"\ |
|
|
\"\"\n user_id: str | None\n \"\"\"The user ID of the cron.\"\"\"\n next_run_date:\ |
|
|
\ datetime | None\n \"\"\"The next run date of the cron.\"\"\"\n metadata:\ |
|
|
\ dict\n \"\"\"The metadata of the cron.\"\"\"" |
|
|
- "class SubGraphState(MessagesState):\n city: str" |
|
|
- "def task_path_str(tup: str | int | tuple) -> str:\n \"\"\"Generate a string\ |
|
|
\ representation of the task path.\"\"\"\n return (\n f\"~{', '.join(task_path_str(x)\ |
|
|
\ for x in tup)}\"\n if isinstance(tup, (tuple, list))\n else f\"\ |
|
|
{tup:010d}\"\n if isinstance(tup, int)\n else str(tup)\n )" |
|
|
- source_sentence: Best practices for test_list_namespaces_operations |
|
|
sentences: |
|
|
- "def test_doubly_nested_graph_state(\n sync_checkpointer: BaseCheckpointSaver,\n\ |
|
|
) -> None:\n class State(TypedDict):\n my_key: str\n\n class ChildState(TypedDict):\n\ |
|
|
\ my_key: str\n\n class GrandChildState(TypedDict):\n my_key:\ |
|
|
\ str\n\n def grandchild_1(state: ChildState):\n return {\"my_key\"\ |
|
|
: state[\"my_key\"] + \" here\"}\n\n def grandchild_2(state: ChildState):\n\ |
|
|
\ return {\n \"my_key\": state[\"my_key\"] + \" and there\"\ |
|
|
,\n }\n\n grandchild = StateGraph(GrandChildState)\n grandchild.add_node(\"\ |
|
|
grandchild_1\", grandchild_1)\n grandchild.add_node(\"grandchild_2\", grandchild_2)\n\ |
|
|
\ grandchild.add_edge(\"grandchild_1\", \"grandchild_2\")\n grandchild.set_entry_point(\"\ |
|
|
grandchild_1\")\n grandchild.set_finish_point(\"grandchild_2\")\n\n child\ |
|
|
\ = StateGraph(ChildState)\n child.add_node(\n \"child_1\",\n \ |
|
|
\ grandchild.compile(interrupt_before=[\"grandchild_2\"]),\n )\n child.set_entry_point(\"\ |
|
|
child_1\")\n child.set_finish_point(\"child_1\")\n\n def parent_1(state:\ |
|
|
\ State):\n return {\"my_key\": \"hi \" + state[\"my_key\"]}\n\n def\ |
|
|
\ parent_2(state: State):\n return {\"my_key\": state[\"my_key\"] + \"\ |
|
|
\ and back again\"}\n\n graph = StateGraph(State)\n graph.add_node(\"parent_1\"\ |
|
|
, parent_1)\n graph.add_node(\"child\", child.compile())\n graph.add_node(\"\ |
|
|
parent_2\", parent_2)\n graph.set_entry_point(\"parent_1\")\n graph.add_edge(\"\ |
|
|
parent_1\", \"child\")\n graph.add_edge(\"child\", \"parent_2\")\n graph.set_finish_point(\"\ |
|
|
parent_2\")\n\n app = graph.compile(checkpointer=sync_checkpointer)\n\n \ |
|
|
\ # test invoke w/ nested interrupt\n config = {\"configurable\": {\"thread_id\"\ |
|
|
: \"1\"}}\n assert [\n c\n for c in app.stream(\n \ |
|
|
\ {\"my_key\": \"my value\"}, config, subgraphs=True, durability=\"exit\"\n \ |
|
|
\ )\n ] == [\n ((), {\"parent_1\": {\"my_key\": \"hi my value\"\ |
|
|
}}),\n (\n (AnyStr(\"child:\"), AnyStr(\"child_1:\")),\n \ |
|
|
\ {\"grandchild_1\": {\"my_key\": \"hi my value here\"}},\n ),\n\ |
|
|
\ ((), {\"__interrupt__\": ()}),\n ]\n # get state without subgraphs\n\ |
|
|
\ outer_state = app.get_state(config)\n assert outer_state == StateSnapshot(\n\ |
|
|
\ values={\"my_key\": \"hi my value\"},\n tasks=(\n PregelTask(\n\ |
|
|
\ AnyStr(),\n \"child\",\n (PULL,\ |
|
|
\ \"child\"),\n state={\n \"configurable\":\ |
|
|
\ {\n \"thread_id\": \"1\",\n \"\ |
|
|
checkpoint_ns\": AnyStr(\"child\"),\n }\n },\n\ |
|
|
\ ),\n ),\n next=(\"child\",),\n config={\n \ |
|
|
\ \"configurable\": {\n \"thread_id\": \"1\",\n \ |
|
|
\ \"checkpoint_ns\": \"\",\n \"checkpoint_id\": AnyStr(),\n\ |
|
|
\ }\n },\n metadata={\n \"parents\": {},\n\ |
|
|
\ \"source\": \"loop\",\n \"step\": 1,\n },\n \ |
|
|
\ created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n\ |
|
|
\ )\n child_state = app.get_state(outer_state.tasks[0].state)\n assert\ |
|
|
\ child_state == StateSnapshot(\n values={\"my_key\": \"hi my value\"},\n\ |
|
|
\ tasks=(\n PregelTask(\n AnyStr(),\n \ |
|
|
\ \"child_1\",\n (PULL, \"child_1\"),\n \ |
|
|
\ state={\n \"configurable\": {\n \"\ |
|
|
thread_id\": \"1\",\n \"checkpoint_ns\": AnyStr(),\n \ |
|
|
\ }\n },\n ),\n ),\n \ |
|
|
\ next=(\"child_1\",),\n config={\n \"configurable\": {\n \ |
|
|
\ \"thread_id\": \"1\",\n \"checkpoint_ns\": AnyStr(\"\ |
|
|
child:\"),\n \"checkpoint_id\": AnyStr(),\n \"checkpoint_map\"\ |
|
|
: AnyDict(\n {\n \"\": AnyStr(),\n \ |
|
|
\ AnyStr(\"child:\"): AnyStr(),\n }\n\ |
|
|
\ ),\n }\n },\n metadata={\n \ |
|
|
\ \"parents\": {\"\": AnyStr()},\n \"source\": \"loop\",\n \ |
|
|
\ \"step\": 0,\n },\n created_at=AnyStr(),\n parent_config=None,\n\ |
|
|
\ interrupts=(),\n )\n grandchild_state = app.get_state(child_state.tasks[0].state)\n\ |
|
|
\ assert grandchild_state == StateSnapshot(\n values={\"my_key\": \"\ |
|
|
hi my value here\"},\n tasks=(\n PregelTask(\n \ |
|
|
\ AnyStr(),\n \"grandchild_2\",\n (PULL, \"grandchild_2\"\ |
|
|
),\n ),\n ),\n next=(\"grandchild_2\",),\n config={\n\ |
|
|
\ \"configurable\": {\n \"thread_id\": \"1\",\n \ |
|
|
\ \"checkpoint_ns\": AnyStr(),\n \"checkpoint_id\":\ |
|
|
\ AnyStr(),\n \"checkpoint_map\": AnyDict(\n \ |
|
|
\ {\n \"\": AnyStr(),\n AnyStr(\"\ |
|
|
child:\"): AnyStr(),\n AnyStr(re.compile(r\"child:.+|child1:\"\ |
|
|
)): AnyStr(),\n }\n ),\n }\n \ |
|
|
\ },\n metadata={\n \"parents\": AnyDict(\n \ |
|
|
\ {\n \"\": AnyStr(),\n AnyStr(\"child:\"\ |
|
|
): AnyStr(),\n }\n ),\n \"source\": \"loop\"\ |
|
|
,\n \"step\": 1,\n },\n created_at=AnyStr(),\n \ |
|
|
\ parent_config=None,\n interrupts=(),\n )\n # get state with subgraphs\n\ |
|
|
\ assert app.get_state(config, subgraphs=True) == StateSnapshot(\n values={\"\ |
|
|
my_key\": \"hi my value\"},\n tasks=(\n PregelTask(\n \ |
|
|
\ AnyStr(),\n \"child\",\n (PULL, \"child\"\ |
|
|
),\n state=StateSnapshot(\n values={\"my_key\"\ |
|
|
: \"hi my value\"},\n tasks=(\n PregelTask(\n\ |
|
|
\ AnyStr(),\n \"child_1\"\ |
|
|
,\n (PULL, \"child_1\"),\n \ |
|
|
\ state=StateSnapshot(\n values={\"my_key\"\ |
|
|
: \"hi my value here\"},\n tasks=(\n \ |
|
|
\ PregelTask(\n \ |
|
|
\ AnyStr(),\n \"grandchild_2\",\n \ |
|
|
\ (PULL, \"grandchild_2\"),\n \ |
|
|
\ ),\n ),\n \ |
|
|
\ next=(\"grandchild_2\",),\n \ |
|
|
\ config={\n \"configurable\": {\n \ |
|
|
\ \"thread_id\": \"1\",\n \ |
|
|
\ \"checkpoint_ns\": AnyStr(),\n \ |
|
|
\ \"checkpoint_id\": AnyStr(),\n \ |
|
|
\ \"checkpoint_map\": AnyDict(\n \ |
|
|
\ {\n \"\": AnyStr(),\n \ |
|
|
\ AnyStr(\"child:\"): AnyStr(),\n\ |
|
|
\ AnyStr(\n \ |
|
|
\ re.compile(r\"child:.+|child1:\")\n \ |
|
|
\ ): AnyStr(),\n \ |
|
|
\ }\n ),\n \ |
|
|
\ }\n },\n \ |
|
|
\ metadata={\n \"parents\"\ |
|
|
: AnyDict(\n {\n \ |
|
|
\ \"\": AnyStr(),\n \ |
|
|
\ AnyStr(\"child:\"): AnyStr(),\n \ |
|
|
\ }\n ),\n \ |
|
|
\ \"source\": \"loop\",\n \"step\": 1,\n\ |
|
|
\ },\n created_at=AnyStr(),\n\ |
|
|
\ parent_config=None,\n \ |
|
|
\ interrupts=(),\n ),\n \ |
|
|
\ ),\n ),\n next=(\"child_1\",),\n \ |
|
|
\ config={\n \"configurable\": {\n \ |
|
|
\ \"thread_id\": \"1\",\n \ |
|
|
\ \"checkpoint_ns\": AnyStr(\"child:\"),\n \"checkpoint_id\"\ |
|
|
: AnyStr(),\n \"checkpoint_map\": AnyDict(\n \ |
|
|
\ {\"\": AnyStr(), AnyStr(\"child:\"): AnyStr()}\n \ |
|
|
\ ),\n }\n \ |
|
|
\ },\n metadata={\n \"parents\": {\"\ |
|
|
\": AnyStr()},\n \"source\": \"loop\",\n \ |
|
|
\ \"step\": 0,\n },\n created_at=AnyStr(),\n\ |
|
|
\ parent_config=None,\n interrupts=(),\n\ |
|
|
\ ),\n ),\n ),\n next=(\"child\",),\n\ |
|
|
\ config={\n \"configurable\": {\n \"thread_id\"\ |
|
|
: \"1\",\n \"checkpoint_ns\": \"\",\n \"checkpoint_id\"\ |
|
|
: AnyStr(),\n }\n },\n metadata={\n \"parents\"\ |
|
|
: {},\n \"source\": \"loop\",\n \"step\": 1,\n },\n\ |
|
|
\ created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n\ |
|
|
\ )\n # # resume\n assert [c for c in app.stream(None, config, subgraphs=True,\ |
|
|
\ durability=\"exit\")] == [\n (\n (AnyStr(\"child:\"), AnyStr(\"\ |
|
|
child_1:\")),\n {\"grandchild_2\": {\"my_key\": \"hi my value here\ |
|
|
\ and there\"}},\n ),\n ((AnyStr(\"child:\"),), {\"child_1\": {\"\ |
|
|
my_key\": \"hi my value here and there\"}}),\n ((), {\"child\": {\"my_key\"\ |
|
|
: \"hi my value here and there\"}}),\n ((), {\"parent_2\": {\"my_key\"\ |
|
|
: \"hi my value here and there and back again\"}}),\n ]\n # get state with\ |
|
|
\ and without subgraphs\n assert (\n app.get_state(config)\n \ |
|
|
\ == app.get_state(config, subgraphs=True)\n == StateSnapshot(\n \ |
|
|
\ values={\"my_key\": \"hi my value here and there and back again\"},\n \ |
|
|
\ tasks=(),\n next=(),\n config={\n \ |
|
|
\ \"configurable\": {\n \"thread_id\": \"1\",\n \ |
|
|
\ \"checkpoint_ns\": \"\",\n \"checkpoint_id\"\ |
|
|
: AnyStr(),\n }\n },\n metadata={\n \ |
|
|
\ \"parents\": {},\n \"source\": \"loop\",\n \ |
|
|
\ \"step\": 3,\n },\n created_at=AnyStr(),\n \ |
|
|
\ parent_config=(\n {\n \"configurable\"\ |
|
|
: {\n \"thread_id\": \"1\",\n \"\ |
|
|
checkpoint_ns\": \"\",\n \"checkpoint_id\": AnyStr(),\n\ |
|
|
\ }\n }\n ),\n interrupts=(),\n\ |
|
|
\ )\n )\n\n # get outer graph history\n outer_history = list(app.get_state_history(config))\n\ |
|
|
\ assert outer_history == [\n StateSnapshot(\n values={\"\ |
|
|
my_key\": \"hi my value here and there and back again\"},\n tasks=(),\n\ |
|
|
\ next=(),\n config={\n \"configurable\"\ |
|
|
: {\n \"thread_id\": \"1\",\n \"checkpoint_ns\"\ |
|
|
: \"\",\n \"checkpoint_id\": AnyStr(),\n }\n\ |
|
|
\ },\n metadata={\n \"parents\": {},\n \ |
|
|
\ \"source\": \"loop\",\n \"step\": 3,\n \ |
|
|
\ },\n created_at=AnyStr(),\n parent_config={\n \ |
|
|
\ \"configurable\": {\n \"thread_id\": \"1\",\n \ |
|
|
\ \"checkpoint_ns\": \"\",\n \"checkpoint_id\"\ |
|
|
: AnyStr(),\n }\n },\n interrupts=(),\n \ |
|
|
\ ),\n StateSnapshot(\n values={\"my_key\": \"hi my value\"\ |
|
|
},\n tasks=(\n PregelTask(\n AnyStr(),\n\ |
|
|
\ \"child\",\n (PULL, \"child\"),\n \ |
|
|
\ state={\n \"configurable\": {\n \ |
|
|
\ \"thread_id\": \"1\",\n \"checkpoint_ns\"\ |
|
|
: AnyStr(\"child\"),\n }\n },\n \ |
|
|
\ result=None,\n ),\n ),\n \ |
|
|
\ next=(\"child\",),\n config={\n \"configurable\"\ |
|
|
: {\n \"thread_id\": \"1\",\n \"checkpoint_ns\"\ |
|
|
: \"\",\n \"checkpoint_id\": AnyStr(),\n }\n\ |
|
|
\ },\n metadata={\n \"parents\": {},\n \ |
|
|
\ \"source\": \"loop\",\n \"step\": 1,\n \ |
|
|
\ },\n created_at=AnyStr(),\n parent_config=None,\n \ |
|
|
\ interrupts=(),\n ),\n ]\n # get child graph history\n\ |
|
|
\ child_history = list(app.get_state_history(outer_history[1].tasks[0].state))\n\ |
|
|
\ assert child_history == [\n StateSnapshot(\n values={\"\ |
|
|
my_key\": \"hi my value\"},\n next=(\"child_1\",),\n config={\n\ |
|
|
\ \"configurable\": {\n \"thread_id\": \"1\"\ |
|
|
,\n \"checkpoint_ns\": AnyStr(\"child:\"),\n \ |
|
|
\ \"checkpoint_id\": AnyStr(),\n \"checkpoint_map\": AnyDict(\n\ |
|
|
\ {\"\": AnyStr(), AnyStr(\"child:\"): AnyStr()}\n \ |
|
|
\ ),\n }\n },\n metadata={\n\ |
|
|
\ \"source\": \"loop\",\n \"step\": 0,\n \ |
|
|
\ \"parents\": {\"\": AnyStr()},\n },\n created_at=AnyStr(),\n\ |
|
|
\ parent_config=None,\n tasks=(\n PregelTask(\n\ |
|
|
\ id=AnyStr(),\n name=\"child_1\",\n \ |
|
|
\ path=(PULL, \"child_1\"),\n state={\n \ |
|
|
\ \"configurable\": {\n \"thread_id\"\ |
|
|
: \"1\",\n \"checkpoint_ns\": AnyStr(\"child:\"),\n\ |
|
|
\ }\n },\n result=None,\n\ |
|
|
\ ),\n ),\n interrupts=(),\n ),\n\ |
|
|
\ ]\n # get grandchild graph history\n grandchild_history = list(app.get_state_history(child_history[0].tasks[0].state))\n\ |
|
|
\ assert grandchild_history == [\n StateSnapshot(\n values={\"\ |
|
|
my_key\": \"hi my value here\"},\n next=(\"grandchild_2\",),\n \ |
|
|
\ config={\n \"configurable\": {\n \"\ |
|
|
thread_id\": \"1\",\n \"checkpoint_ns\": AnyStr(),\n \ |
|
|
\ \"checkpoint_id\": AnyStr(),\n \"checkpoint_map\"\ |
|
|
: AnyDict(\n {\n \"\": AnyStr(),\n\ |
|
|
\ AnyStr(\"child:\"): AnyStr(),\n \ |
|
|
\ AnyStr(re.compile(r\"child:.+|child1:\")): AnyStr(),\n \ |
|
|
\ }\n ),\n }\n },\n \ |
|
|
\ metadata={\n \"source\": \"loop\",\n \ |
|
|
\ \"step\": 1,\n \"parents\": AnyDict(\n {\n\ |
|
|
\ \"\": AnyStr(),\n AnyStr(\"child:\"\ |
|
|
): AnyStr(),\n }\n ),\n },\n \ |
|
|
\ created_at=AnyStr(),\n parent_config=None,\n tasks=(\n\ |
|
|
\ PregelTask(\n id=AnyStr(),\n \ |
|
|
\ name=\"grandchild_2\",\n path=(PULL, \"grandchild_2\"\ |
|
|
),\n result=None,\n ),\n ),\n \ |
|
|
\ interrupts=(),\n ),\n ]" |
|
|
- "def _msgpack_enc(data: Any) -> bytes:\n return ormsgpack.packb(data, default=_msgpack_default,\ |
|
|
\ option=_option)" |
|
|
- "def test_list_namespaces_operations(\n fake_embeddings: CharacterEmbeddings,\n\ |
|
|
) -> None:\n \"\"\"Test list namespaces functionality with various filters.\"\ |
|
|
\"\"\n with create_vector_store(\n fake_embeddings, text_fields=[\"\ |
|
|
key0\", \"key1\", \"key3\"]\n ) as store:\n test_pref = str(uuid.uuid4())\n\ |
|
|
\ test_namespaces = [\n (test_pref, \"test\", \"documents\"\ |
|
|
, \"public\", test_pref),\n (test_pref, \"test\", \"documents\", \"\ |
|
|
private\", test_pref),\n (test_pref, \"test\", \"images\", \"public\"\ |
|
|
, test_pref),\n (test_pref, \"test\", \"images\", \"private\", test_pref),\n\ |
|
|
\ (test_pref, \"prod\", \"documents\", \"public\", test_pref),\n \ |
|
|
\ (test_pref, \"prod\", \"documents\", \"some\", \"nesting\", \"public\"\ |
|
|
, test_pref),\n (test_pref, \"prod\", \"documents\", \"private\", test_pref),\n\ |
|
|
\ ]\n\n # Add test data\n for namespace in test_namespaces:\n\ |
|
|
\ store.put(namespace, \"dummy\", {\"content\": \"dummy\"})\n\n \ |
|
|
\ # Test prefix filtering\n prefix_result = store.list_namespaces(prefix=(test_pref,\ |
|
|
\ \"test\"))\n assert len(prefix_result) == 4\n assert all(ns[1]\ |
|
|
\ == \"test\" for ns in prefix_result)\n\n # Test specific prefix\n \ |
|
|
\ specific_prefix_result = store.list_namespaces(\n prefix=(test_pref,\ |
|
|
\ \"test\", \"documents\")\n )\n assert len(specific_prefix_result)\ |
|
|
\ == 2\n assert all(ns[1:3] == (\"test\", \"documents\") for ns in specific_prefix_result)\n\ |
|
|
\n # Test suffix filtering\n suffix_result = store.list_namespaces(suffix=(\"\ |
|
|
public\", test_pref))\n assert len(suffix_result) == 4\n assert\ |
|
|
\ all(ns[-2] == \"public\" for ns in suffix_result)\n\n # Test combined\ |
|
|
\ prefix and suffix\n prefix_suffix_result = store.list_namespaces(\n \ |
|
|
\ prefix=(test_pref, \"test\"), suffix=(\"public\", test_pref)\n \ |
|
|
\ )\n assert len(prefix_suffix_result) == 2\n assert all(\n\ |
|
|
\ ns[1] == \"test\" and ns[-2] == \"public\" for ns in prefix_suffix_result\n\ |
|
|
\ )\n\n # Test wildcard in prefix\n wildcard_prefix_result\ |
|
|
\ = store.list_namespaces(\n prefix=(test_pref, \"*\", \"documents\"\ |
|
|
)\n )\n assert len(wildcard_prefix_result) == 5\n assert\ |
|
|
\ all(ns[2] == \"documents\" for ns in wildcard_prefix_result)\n\n # Test\ |
|
|
\ wildcard in suffix\n wildcard_suffix_result = store.list_namespaces(\n\ |
|
|
\ suffix=(\"*\", \"public\", test_pref)\n )\n assert\ |
|
|
\ len(wildcard_suffix_result) == 4\n assert all(ns[-2] == \"public\" for\ |
|
|
\ ns in wildcard_suffix_result)\n\n wildcard_single = store.list_namespaces(\n\ |
|
|
\ suffix=(\"some\", \"*\", \"public\", test_pref)\n )\n \ |
|
|
\ assert len(wildcard_single) == 1\n assert wildcard_single[0] == (\n\ |
|
|
\ test_pref,\n \"prod\",\n \"documents\",\n \ |
|
|
\ \"some\",\n \"nesting\",\n \"public\",\n \ |
|
|
\ test_pref,\n )\n\n # Test max depth\n max_depth_result\ |
|
|
\ = store.list_namespaces(max_depth=3)\n assert all(len(ns) <= 3 for ns\ |
|
|
\ in max_depth_result)\n\n max_depth_result = store.list_namespaces(\n\ |
|
|
\ max_depth=4, prefix=(test_pref, \"*\", \"documents\")\n )\n\ |
|
|
\ assert len(set(res for res in max_depth_result)) == len(max_depth_result)\ |
|
|
\ == 5\n\n # Test pagination\n limit_result = store.list_namespaces(prefix=(test_pref,),\ |
|
|
\ limit=3)\n assert len(limit_result) == 3\n\n offset_result = store.list_namespaces(prefix=(test_pref,),\ |
|
|
\ offset=3)\n assert len(offset_result) == len(test_namespaces) - 3\n\n\ |
|
|
\ empty_prefix_result = store.list_namespaces(prefix=(test_pref,))\n \ |
|
|
\ assert len(empty_prefix_result) == len(test_namespaces)\n assert\ |
|
|
\ set(empty_prefix_result) == set(test_namespaces)\n\n # Clean up\n \ |
|
|
\ for namespace in test_namespaces:\n store.delete(namespace, \"\ |
|
|
dummy\")" |
|
|
pipeline_tag: sentence-similarity |
|
|
library_name: sentence-transformers |
|
|
metrics: |
|
|
- cosine_accuracy@1 |
|
|
- cosine_accuracy@3 |
|
|
- cosine_accuracy@5 |
|
|
- cosine_accuracy@10 |
|
|
- cosine_precision@1 |
|
|
- cosine_precision@3 |
|
|
- cosine_precision@5 |
|
|
- cosine_precision@10 |
|
|
- cosine_recall@1 |
|
|
- cosine_recall@3 |
|
|
- cosine_recall@5 |
|
|
- cosine_recall@10 |
|
|
- cosine_ndcg@10 |
|
|
- cosine_mrr@10 |
|
|
- cosine_map@100 |
|
|
model-index: |
|
|
- name: CodeBERT dense retriever |
|
|
results: |
|
|
- task: |
|
|
type: information-retrieval |
|
|
name: Information Retrieval |
|
|
dataset: |
|
|
name: dim 768 |
|
|
type: dim_768 |
|
|
metrics: |
|
|
- type: cosine_accuracy@1 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@1 |
|
|
- type: cosine_accuracy@3 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@3 |
|
|
- type: cosine_accuracy@5 |
|
|
value: 1.0 |
|
|
name: Cosine Accuracy@5 |
|
|
- type: cosine_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Cosine Accuracy@10 |
|
|
- type: cosine_precision@1 |
|
|
value: 0.9 |
|
|
name: Cosine Precision@1 |
|
|
- type: cosine_precision@3 |
|
|
value: 0.29999999999999993 |
|
|
name: Cosine Precision@3 |
|
|
- type: cosine_precision@5 |
|
|
value: 0.20000000000000004 |
|
|
name: Cosine Precision@5 |
|
|
- type: cosine_precision@10 |
|
|
value: 0.10000000000000002 |
|
|
name: Cosine Precision@10 |
|
|
- type: cosine_recall@1 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@1 |
|
|
- type: cosine_recall@3 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@3 |
|
|
- type: cosine_recall@5 |
|
|
value: 1.0 |
|
|
name: Cosine Recall@5 |
|
|
- type: cosine_recall@10 |
|
|
value: 1.0 |
|
|
name: Cosine Recall@10 |
|
|
- type: cosine_ndcg@10 |
|
|
value: 0.9408764682653967 |
|
|
name: Cosine Ndcg@10 |
|
|
- type: cosine_mrr@10 |
|
|
value: 0.9225 |
|
|
name: Cosine Mrr@10 |
|
|
- type: cosine_map@100 |
|
|
value: 0.9225 |
|
|
name: Cosine Map@100 |
|
|
- task: |
|
|
type: information-retrieval |
|
|
name: Information Retrieval |
|
|
dataset: |
|
|
name: dim 512 |
|
|
type: dim_512 |
|
|
metrics: |
|
|
- type: cosine_accuracy@1 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@1 |
|
|
- type: cosine_accuracy@3 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@3 |
|
|
- type: cosine_accuracy@5 |
|
|
value: 1.0 |
|
|
name: Cosine Accuracy@5 |
|
|
- type: cosine_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Cosine Accuracy@10 |
|
|
- type: cosine_precision@1 |
|
|
value: 0.9 |
|
|
name: Cosine Precision@1 |
|
|
- type: cosine_precision@3 |
|
|
value: 0.29999999999999993 |
|
|
name: Cosine Precision@3 |
|
|
- type: cosine_precision@5 |
|
|
value: 0.20000000000000004 |
|
|
name: Cosine Precision@5 |
|
|
- type: cosine_precision@10 |
|
|
value: 0.10000000000000002 |
|
|
name: Cosine Precision@10 |
|
|
- type: cosine_recall@1 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@1 |
|
|
- type: cosine_recall@3 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@3 |
|
|
- type: cosine_recall@5 |
|
|
value: 1.0 |
|
|
name: Cosine Recall@5 |
|
|
- type: cosine_recall@10 |
|
|
value: 1.0 |
|
|
name: Cosine Recall@10 |
|
|
- type: cosine_ndcg@10 |
|
|
value: 0.9408764682653967 |
|
|
name: Cosine Ndcg@10 |
|
|
- type: cosine_mrr@10 |
|
|
value: 0.9225 |
|
|
name: Cosine Mrr@10 |
|
|
- type: cosine_map@100 |
|
|
value: 0.9225 |
|
|
name: Cosine Map@100 |
|
|
- task: |
|
|
type: information-retrieval |
|
|
name: Information Retrieval |
|
|
dataset: |
|
|
name: dim 256 |
|
|
type: dim_256 |
|
|
metrics: |
|
|
- type: cosine_accuracy@1 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@1 |
|
|
- type: cosine_accuracy@3 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@3 |
|
|
- type: cosine_accuracy@5 |
|
|
value: 1.0 |
|
|
name: Cosine Accuracy@5 |
|
|
- type: cosine_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Cosine Accuracy@10 |
|
|
- type: cosine_precision@1 |
|
|
value: 0.9 |
|
|
name: Cosine Precision@1 |
|
|
- type: cosine_precision@3 |
|
|
value: 0.29999999999999993 |
|
|
name: Cosine Precision@3 |
|
|
- type: cosine_precision@5 |
|
|
value: 0.20000000000000004 |
|
|
name: Cosine Precision@5 |
|
|
- type: cosine_precision@10 |
|
|
value: 0.10000000000000002 |
|
|
name: Cosine Precision@10 |
|
|
- type: cosine_recall@1 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@1 |
|
|
- type: cosine_recall@3 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@3 |
|
|
- type: cosine_recall@5 |
|
|
value: 1.0 |
|
|
name: Cosine Recall@5 |
|
|
- type: cosine_recall@10 |
|
|
value: 1.0 |
|
|
name: Cosine Recall@10 |
|
|
- type: cosine_ndcg@10 |
|
|
value: 0.9408764682653967 |
|
|
name: Cosine Ndcg@10 |
|
|
- type: cosine_mrr@10 |
|
|
value: 0.9225 |
|
|
name: Cosine Mrr@10 |
|
|
- type: cosine_map@100 |
|
|
value: 0.9225 |
|
|
name: Cosine Map@100 |
|
|
- task: |
|
|
type: information-retrieval |
|
|
name: Information Retrieval |
|
|
dataset: |
|
|
name: dim 128 |
|
|
type: dim_128 |
|
|
metrics: |
|
|
- type: cosine_accuracy@1 |
|
|
value: 0.85 |
|
|
name: Cosine Accuracy@1 |
|
|
- type: cosine_accuracy@3 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@3 |
|
|
- type: cosine_accuracy@5 |
|
|
value: 0.95 |
|
|
name: Cosine Accuracy@5 |
|
|
- type: cosine_accuracy@10 |
|
|
value: 0.95 |
|
|
name: Cosine Accuracy@10 |
|
|
- type: cosine_precision@1 |
|
|
value: 0.85 |
|
|
name: Cosine Precision@1 |
|
|
- type: cosine_precision@3 |
|
|
value: 0.29999999999999993 |
|
|
name: Cosine Precision@3 |
|
|
- type: cosine_precision@5 |
|
|
value: 0.19000000000000003 |
|
|
name: Cosine Precision@5 |
|
|
- type: cosine_precision@10 |
|
|
value: 0.09500000000000001 |
|
|
name: Cosine Precision@10 |
|
|
- type: cosine_recall@1 |
|
|
value: 0.85 |
|
|
name: Cosine Recall@1 |
|
|
- type: cosine_recall@3 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@3 |
|
|
- type: cosine_recall@5 |
|
|
value: 0.95 |
|
|
name: Cosine Recall@5 |
|
|
- type: cosine_recall@10 |
|
|
value: 0.95 |
|
|
name: Cosine Recall@10 |
|
|
- type: cosine_ndcg@10 |
|
|
value: 0.894342640361727 |
|
|
name: Cosine Ndcg@10 |
|
|
- type: cosine_mrr@10 |
|
|
value: 0.8766666666666666 |
|
|
name: Cosine Mrr@10 |
|
|
- type: cosine_map@100 |
|
|
value: 0.8799999999999999 |
|
|
name: Cosine Map@100 |
|
|
- task: |
|
|
type: information-retrieval |
|
|
name: Information Retrieval |
|
|
dataset: |
|
|
name: dim 64 |
|
|
type: dim_64 |
|
|
metrics: |
|
|
- type: cosine_accuracy@1 |
|
|
value: 0.85 |
|
|
name: Cosine Accuracy@1 |
|
|
- type: cosine_accuracy@3 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@3 |
|
|
- type: cosine_accuracy@5 |
|
|
value: 0.9 |
|
|
name: Cosine Accuracy@5 |
|
|
- type: cosine_accuracy@10 |
|
|
value: 1.0 |
|
|
name: Cosine Accuracy@10 |
|
|
- type: cosine_precision@1 |
|
|
value: 0.85 |
|
|
name: Cosine Precision@1 |
|
|
- type: cosine_precision@3 |
|
|
value: 0.29999999999999993 |
|
|
name: Cosine Precision@3 |
|
|
- type: cosine_precision@5 |
|
|
value: 0.18000000000000005 |
|
|
name: Cosine Precision@5 |
|
|
- type: cosine_precision@10 |
|
|
value: 0.10000000000000002 |
|
|
name: Cosine Precision@10 |
|
|
- type: cosine_recall@1 |
|
|
value: 0.85 |
|
|
name: Cosine Recall@1 |
|
|
- type: cosine_recall@3 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@3 |
|
|
- type: cosine_recall@5 |
|
|
value: 0.9 |
|
|
name: Cosine Recall@5 |
|
|
- type: cosine_recall@10 |
|
|
value: 1.0 |
|
|
name: Cosine Recall@10 |
|
|
- type: cosine_ndcg@10 |
|
|
value: 0.9074399105059531 |
|
|
name: Cosine Ndcg@10 |
|
|
- type: cosine_mrr@10 |
|
|
value: 0.8800595238095237 |
|
|
name: Cosine Mrr@10 |
|
|
- type: cosine_map@100 |
|
|
value: 0.8800595238095237 |
|
|
name: Cosine Map@100 |
|
|
--- |
|
|
|
|
|
# CodeBERT dense retriever
|
|
|
|
|
This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [shubharuidas/codebert-embed-base-dense-retriever](https://huggingface.co/shubharuidas/codebert-embed-base-dense-retriever). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more. |
|
|
|
|
|
## Model Details |
|
|
|
|
|
### Model Description |
|
|
- **Model Type:** Sentence Transformer |
|
|
- **Base model:** [shubharuidas/codebert-embed-base-dense-retriever](https://huggingface.co/shubharuidas/codebert-embed-base-dense-retriever) <!-- at revision 9594580ae943039d0b85feb304404f9b2bb203ce --> |
|
|
- **Maximum Sequence Length:** 512 tokens |
|
|
- **Output Dimensionality:** 768 dimensions |
|
|
- **Similarity Function:** Cosine Similarity |
|
|
<!-- - **Training Dataset:** Unknown --> |
|
|
- **Language:** en |
|
|
- **License:** apache-2.0 |
|
|
|
|
|
### Model Sources |
|
|
|
|
|
- **Documentation:** [Sentence Transformers Documentation](https://sbert.net) |
|
|
- **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers) |
|
|
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers) |
|
|
|
|
|
### Full Model Architecture |
|
|
|
|
|
``` |
|
|
SentenceTransformer( |
|
|
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'RobertaModel'}) |
|
|
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True}) |
|
|
) |
|
|
``` |
|
|
|
|
|
## Usage |
|
|
|
|
|
### Direct Usage (Sentence Transformers) |
|
|
|
|
|
First install the Sentence Transformers library: |
|
|
|
|
|
```bash |
|
|
pip install -U sentence-transformers |
|
|
``` |
|
|
|
|
|
Then you can load this model and run inference. |
|
|
```python |
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
# Download from the 🤗 Hub |
|
|
model = SentenceTransformer("anaghaj111/codebert-base-code-embed-mrl-langchain-langgraph") |
|
|
# Run inference |
|
|
sentences = [ |
|
|
'Best practices for test_list_namespaces_operations', |
|
|
'def test_list_namespaces_operations(\n fake_embeddings: CharacterEmbeddings,\n) -> None:\n """Test list namespaces functionality with various filters."""\n with create_vector_store(\n fake_embeddings, text_fields=["key0", "key1", "key3"]\n ) as store:\n test_pref = str(uuid.uuid4())\n test_namespaces = [\n (test_pref, "test", "documents", "public", test_pref),\n (test_pref, "test", "documents", "private", test_pref),\n (test_pref, "test", "images", "public", test_pref),\n (test_pref, "test", "images", "private", test_pref),\n (test_pref, "prod", "documents", "public", test_pref),\n (test_pref, "prod", "documents", "some", "nesting", "public", test_pref),\n (test_pref, "prod", "documents", "private", test_pref),\n ]\n\n # Add test data\n for namespace in test_namespaces:\n store.put(namespace, "dummy", {"content": "dummy"})\n\n # Test prefix filtering\n prefix_result = store.list_namespaces(prefix=(test_pref, "test"))\n assert len(prefix_result) == 4\n assert all(ns[1] == "test" for ns in prefix_result)\n\n # Test specific prefix\n specific_prefix_result = store.list_namespaces(\n prefix=(test_pref, "test", "documents")\n )\n assert len(specific_prefix_result) == 2\n assert all(ns[1:3] == ("test", "documents") for ns in specific_prefix_result)\n\n # Test suffix filtering\n suffix_result = store.list_namespaces(suffix=("public", test_pref))\n assert len(suffix_result) == 4\n assert all(ns[-2] == "public" for ns in suffix_result)\n\n # Test combined prefix and suffix\n prefix_suffix_result = store.list_namespaces(\n prefix=(test_pref, "test"), suffix=("public", test_pref)\n )\n assert len(prefix_suffix_result) == 2\n assert all(\n ns[1] == "test" and ns[-2] == "public" for ns in prefix_suffix_result\n )\n\n # Test wildcard in prefix\n wildcard_prefix_result = store.list_namespaces(\n prefix=(test_pref, "*", "documents")\n )\n assert len(wildcard_prefix_result) == 5\n assert all(ns[2] == "documents" for ns in wildcard_prefix_result)\n\n # Test wildcard in suffix\n 
wildcard_suffix_result = store.list_namespaces(\n suffix=("*", "public", test_pref)\n )\n assert len(wildcard_suffix_result) == 4\n assert all(ns[-2] == "public" for ns in wildcard_suffix_result)\n\n wildcard_single = store.list_namespaces(\n suffix=("some", "*", "public", test_pref)\n )\n assert len(wildcard_single) == 1\n assert wildcard_single[0] == (\n test_pref,\n "prod",\n "documents",\n "some",\n "nesting",\n "public",\n test_pref,\n )\n\n # Test max depth\n max_depth_result = store.list_namespaces(max_depth=3)\n assert all(len(ns) <= 3 for ns in max_depth_result)\n\n max_depth_result = store.list_namespaces(\n max_depth=4, prefix=(test_pref, "*", "documents")\n )\n assert len(set(res for res in max_depth_result)) == len(max_depth_result) == 5\n\n # Test pagination\n limit_result = store.list_namespaces(prefix=(test_pref,), limit=3)\n assert len(limit_result) == 3\n\n offset_result = store.list_namespaces(prefix=(test_pref,), offset=3)\n assert len(offset_result) == len(test_namespaces) - 3\n\n empty_prefix_result = store.list_namespaces(prefix=(test_pref,))\n assert len(empty_prefix_result) == len(test_namespaces)\n assert set(empty_prefix_result) == set(test_namespaces)\n\n # Clean up\n for namespace in test_namespaces:\n store.delete(namespace, "dummy")', |
|
|
'def test_doubly_nested_graph_state(\n sync_checkpointer: BaseCheckpointSaver,\n) -> None:\n class State(TypedDict):\n my_key: str\n\n class ChildState(TypedDict):\n my_key: str\n\n class GrandChildState(TypedDict):\n my_key: str\n\n def grandchild_1(state: ChildState):\n return {"my_key": state["my_key"] + " here"}\n\n def grandchild_2(state: ChildState):\n return {\n "my_key": state["my_key"] + " and there",\n }\n\n grandchild = StateGraph(GrandChildState)\n grandchild.add_node("grandchild_1", grandchild_1)\n grandchild.add_node("grandchild_2", grandchild_2)\n grandchild.add_edge("grandchild_1", "grandchild_2")\n grandchild.set_entry_point("grandchild_1")\n grandchild.set_finish_point("grandchild_2")\n\n child = StateGraph(ChildState)\n child.add_node(\n "child_1",\n grandchild.compile(interrupt_before=["grandchild_2"]),\n )\n child.set_entry_point("child_1")\n child.set_finish_point("child_1")\n\n def parent_1(state: State):\n return {"my_key": "hi " + state["my_key"]}\n\n def parent_2(state: State):\n return {"my_key": state["my_key"] + " and back again"}\n\n graph = StateGraph(State)\n graph.add_node("parent_1", parent_1)\n graph.add_node("child", child.compile())\n graph.add_node("parent_2", parent_2)\n graph.set_entry_point("parent_1")\n graph.add_edge("parent_1", "child")\n graph.add_edge("child", "parent_2")\n graph.set_finish_point("parent_2")\n\n app = graph.compile(checkpointer=sync_checkpointer)\n\n # test invoke w/ nested interrupt\n config = {"configurable": {"thread_id": "1"}}\n assert [\n c\n for c in app.stream(\n {"my_key": "my value"}, config, subgraphs=True, durability="exit"\n )\n ] == [\n ((), {"parent_1": {"my_key": "hi my value"}}),\n (\n (AnyStr("child:"), AnyStr("child_1:")),\n {"grandchild_1": {"my_key": "hi my value here"}},\n ),\n ((), {"__interrupt__": ()}),\n ]\n # get state without subgraphs\n outer_state = app.get_state(config)\n assert outer_state == StateSnapshot(\n values={"my_key": "hi my value"},\n tasks=(\n PregelTask(\n 
AnyStr(),\n "child",\n (PULL, "child"),\n state={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr("child"),\n }\n },\n ),\n ),\n next=("child",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": "",\n "checkpoint_id": AnyStr(),\n }\n },\n metadata={\n "parents": {},\n "source": "loop",\n "step": 1,\n },\n created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n )\n child_state = app.get_state(outer_state.tasks[0].state)\n assert child_state == StateSnapshot(\n values={"my_key": "hi my value"},\n tasks=(\n PregelTask(\n AnyStr(),\n "child_1",\n (PULL, "child_1"),\n state={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr(),\n }\n },\n ),\n ),\n next=("child_1",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr("child:"),\n "checkpoint_id": AnyStr(),\n "checkpoint_map": AnyDict(\n {\n "": AnyStr(),\n AnyStr("child:"): AnyStr(),\n }\n ),\n }\n },\n metadata={\n "parents": {"": AnyStr()},\n "source": "loop",\n "step": 0,\n },\n created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n )\n grandchild_state = app.get_state(child_state.tasks[0].state)\n assert grandchild_state == StateSnapshot(\n values={"my_key": "hi my value here"},\n tasks=(\n PregelTask(\n AnyStr(),\n "grandchild_2",\n (PULL, "grandchild_2"),\n ),\n ),\n next=("grandchild_2",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr(),\n "checkpoint_id": AnyStr(),\n "checkpoint_map": AnyDict(\n {\n "": AnyStr(),\n AnyStr("child:"): AnyStr(),\n AnyStr(re.compile(r"child:.+|child1:")): AnyStr(),\n }\n ),\n }\n },\n metadata={\n "parents": AnyDict(\n {\n "": AnyStr(),\n AnyStr("child:"): AnyStr(),\n }\n ),\n "source": "loop",\n "step": 1,\n },\n created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n )\n # get state with subgraphs\n assert app.get_state(config, subgraphs=True) == StateSnapshot(\n values={"my_key": "hi my value"},\n tasks=(\n PregelTask(\n AnyStr(),\n "child",\n (PULL, 
"child"),\n state=StateSnapshot(\n values={"my_key": "hi my value"},\n tasks=(\n PregelTask(\n AnyStr(),\n "child_1",\n (PULL, "child_1"),\n state=StateSnapshot(\n values={"my_key": "hi my value here"},\n tasks=(\n PregelTask(\n AnyStr(),\n "grandchild_2",\n (PULL, "grandchild_2"),\n ),\n ),\n next=("grandchild_2",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr(),\n "checkpoint_id": AnyStr(),\n "checkpoint_map": AnyDict(\n {\n "": AnyStr(),\n AnyStr("child:"): AnyStr(),\n AnyStr(\n re.compile(r"child:.+|child1:")\n ): AnyStr(),\n }\n ),\n }\n },\n metadata={\n "parents": AnyDict(\n {\n "": AnyStr(),\n AnyStr("child:"): AnyStr(),\n }\n ),\n "source": "loop",\n "step": 1,\n },\n created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n ),\n ),\n ),\n next=("child_1",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr("child:"),\n "checkpoint_id": AnyStr(),\n "checkpoint_map": AnyDict(\n {"": AnyStr(), AnyStr("child:"): AnyStr()}\n ),\n }\n },\n metadata={\n "parents": {"": AnyStr()},\n "source": "loop",\n "step": 0,\n },\n created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n ),\n ),\n ),\n next=("child",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": "",\n "checkpoint_id": AnyStr(),\n }\n },\n metadata={\n "parents": {},\n "source": "loop",\n "step": 1,\n },\n created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n )\n # # resume\n assert [c for c in app.stream(None, config, subgraphs=True, durability="exit")] == [\n (\n (AnyStr("child:"), AnyStr("child_1:")),\n {"grandchild_2": {"my_key": "hi my value here and there"}},\n ),\n ((AnyStr("child:"),), {"child_1": {"my_key": "hi my value here and there"}}),\n ((), {"child": {"my_key": "hi my value here and there"}}),\n ((), {"parent_2": {"my_key": "hi my value here and there and back again"}}),\n ]\n # get state with and without subgraphs\n assert (\n app.get_state(config)\n == app.get_state(config, subgraphs=True)\n 
== StateSnapshot(\n values={"my_key": "hi my value here and there and back again"},\n tasks=(),\n next=(),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": "",\n "checkpoint_id": AnyStr(),\n }\n },\n metadata={\n "parents": {},\n "source": "loop",\n "step": 3,\n },\n created_at=AnyStr(),\n parent_config=(\n {\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": "",\n "checkpoint_id": AnyStr(),\n }\n }\n ),\n interrupts=(),\n )\n )\n\n # get outer graph history\n outer_history = list(app.get_state_history(config))\n assert outer_history == [\n StateSnapshot(\n values={"my_key": "hi my value here and there and back again"},\n tasks=(),\n next=(),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": "",\n "checkpoint_id": AnyStr(),\n }\n },\n metadata={\n "parents": {},\n "source": "loop",\n "step": 3,\n },\n created_at=AnyStr(),\n parent_config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": "",\n "checkpoint_id": AnyStr(),\n }\n },\n interrupts=(),\n ),\n StateSnapshot(\n values={"my_key": "hi my value"},\n tasks=(\n PregelTask(\n AnyStr(),\n "child",\n (PULL, "child"),\n state={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr("child"),\n }\n },\n result=None,\n ),\n ),\n next=("child",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": "",\n "checkpoint_id": AnyStr(),\n }\n },\n metadata={\n "parents": {},\n "source": "loop",\n "step": 1,\n },\n created_at=AnyStr(),\n parent_config=None,\n interrupts=(),\n ),\n ]\n # get child graph history\n child_history = list(app.get_state_history(outer_history[1].tasks[0].state))\n assert child_history == [\n StateSnapshot(\n values={"my_key": "hi my value"},\n next=("child_1",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr("child:"),\n "checkpoint_id": AnyStr(),\n "checkpoint_map": AnyDict(\n {"": AnyStr(), AnyStr("child:"): AnyStr()}\n ),\n }\n },\n metadata={\n "source": "loop",\n "step": 0,\n 
"parents": {"": AnyStr()},\n },\n created_at=AnyStr(),\n parent_config=None,\n tasks=(\n PregelTask(\n id=AnyStr(),\n name="child_1",\n path=(PULL, "child_1"),\n state={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr("child:"),\n }\n },\n result=None,\n ),\n ),\n interrupts=(),\n ),\n ]\n # get grandchild graph history\n grandchild_history = list(app.get_state_history(child_history[0].tasks[0].state))\n assert grandchild_history == [\n StateSnapshot(\n values={"my_key": "hi my value here"},\n next=("grandchild_2",),\n config={\n "configurable": {\n "thread_id": "1",\n "checkpoint_ns": AnyStr(),\n "checkpoint_id": AnyStr(),\n "checkpoint_map": AnyDict(\n {\n "": AnyStr(),\n AnyStr("child:"): AnyStr(),\n AnyStr(re.compile(r"child:.+|child1:")): AnyStr(),\n }\n ),\n }\n },\n metadata={\n "source": "loop",\n "step": 1,\n "parents": AnyDict(\n {\n "": AnyStr(),\n AnyStr("child:"): AnyStr(),\n }\n ),\n },\n created_at=AnyStr(),\n parent_config=None,\n tasks=(\n PregelTask(\n id=AnyStr(),\n name="grandchild_2",\n path=(PULL, "grandchild_2"),\n result=None,\n ),\n ),\n interrupts=(),\n ),\n ]', |
|
|
] |
|
|
embeddings = model.encode(sentences) |
|
|
print(embeddings.shape) |
|
|
# [3, 768] |
|
|
|
|
|
# Get the similarity scores for the embeddings |
|
|
similarities = model.similarity(embeddings, embeddings) |
|
|
print(similarities) |
|
|
# tensor([[1.0000, 0.7789, 0.3589], |
|
|
# [0.7789, 1.0000, 0.4748], |
|
|
# [0.3589, 0.4748, 1.0000]]) |
|
|
``` |
|
|
|
|
|
<!-- |
|
|
### Direct Usage (Transformers) |
|
|
|
|
|
<details><summary>Click to see the direct usage in Transformers</summary> |
|
|
|
|
|
</details> |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
### Downstream Usage (Sentence Transformers) |
|
|
|
|
|
You can finetune this model on your own dataset. |
|
|
|
|
|
<details><summary>Click to expand</summary> |
|
|
|
|
|
</details> |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
### Out-of-Scope Use |
|
|
|
|
|
*List how the model may foreseeably be misused and address what users ought not to do with the model.* |
|
|
--> |
|
|
|
|
|
## Evaluation |
|
|
|
|
|
### Metrics |
|
|
|
|
|
#### Information Retrieval |
|
|
|
|
|
* Dataset: `dim_768` |
|
|
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator) with these parameters: |
|
|
```json |
|
|
{ |
|
|
"truncate_dim": 768 |
|
|
} |
|
|
``` |
|
|
|
|
|
| Metric | Value | |
|
|
|:--------------------|:-----------| |
|
|
| cosine_accuracy@1 | 0.9 | |
|
|
| cosine_accuracy@3 | 0.9 | |
|
|
| cosine_accuracy@5 | 1.0 | |
|
|
| cosine_accuracy@10 | 1.0 | |
|
|
| cosine_precision@1 | 0.9 | |
|
|
| cosine_precision@3 | 0.3 | |
|
|
| cosine_precision@5 | 0.2 | |
|
|
| cosine_precision@10 | 0.1 | |
|
|
| cosine_recall@1 | 0.9 | |
|
|
| cosine_recall@3 | 0.9 | |
|
|
| cosine_recall@5 | 1.0 | |
|
|
| cosine_recall@10 | 1.0 | |
|
|
| **cosine_ndcg@10** | **0.9409** | |
|
|
| cosine_mrr@10 | 0.9225 | |
|
|
| cosine_map@100 | 0.9225 | |
|
|
|
|
|
#### Information Retrieval |
|
|
|
|
|
* Dataset: `dim_512` |
|
|
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator) with these parameters: |
|
|
```json |
|
|
{ |
|
|
"truncate_dim": 512 |
|
|
} |
|
|
``` |
|
|
|
|
|
| Metric | Value | |
|
|
|:--------------------|:-----------| |
|
|
| cosine_accuracy@1 | 0.9 | |
|
|
| cosine_accuracy@3 | 0.9 | |
|
|
| cosine_accuracy@5 | 1.0 | |
|
|
| cosine_accuracy@10 | 1.0 | |
|
|
| cosine_precision@1 | 0.9 | |
|
|
| cosine_precision@3 | 0.3 | |
|
|
| cosine_precision@5 | 0.2 | |
|
|
| cosine_precision@10 | 0.1 | |
|
|
| cosine_recall@1 | 0.9 | |
|
|
| cosine_recall@3 | 0.9 | |
|
|
| cosine_recall@5 | 1.0 | |
|
|
| cosine_recall@10 | 1.0 | |
|
|
| **cosine_ndcg@10** | **0.9409** | |
|
|
| cosine_mrr@10 | 0.9225 | |
|
|
| cosine_map@100 | 0.9225 | |
|
|
|
|
|
#### Information Retrieval |
|
|
|
|
|
* Dataset: `dim_256` |
|
|
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator) with these parameters: |
|
|
```json |
|
|
{ |
|
|
"truncate_dim": 256 |
|
|
} |
|
|
``` |
|
|
|
|
|
| Metric | Value | |
|
|
|:--------------------|:-----------| |
|
|
| cosine_accuracy@1 | 0.9 | |
|
|
| cosine_accuracy@3 | 0.9 | |
|
|
| cosine_accuracy@5 | 1.0 | |
|
|
| cosine_accuracy@10 | 1.0 | |
|
|
| cosine_precision@1 | 0.9 | |
|
|
| cosine_precision@3 | 0.3 | |
|
|
| cosine_precision@5 | 0.2 | |
|
|
| cosine_precision@10 | 0.1 | |
|
|
| cosine_recall@1 | 0.9 | |
|
|
| cosine_recall@3 | 0.9 | |
|
|
| cosine_recall@5 | 1.0 | |
|
|
| cosine_recall@10 | 1.0 | |
|
|
| **cosine_ndcg@10** | **0.9409** | |
|
|
| cosine_mrr@10 | 0.9225 | |
|
|
| cosine_map@100 | 0.9225 | |
|
|
|
|
|
#### Information Retrieval |
|
|
|
|
|
* Dataset: `dim_128` |
|
|
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator) with these parameters: |
|
|
```json |
|
|
{ |
|
|
"truncate_dim": 128 |
|
|
} |
|
|
``` |
|
|
|
|
|
| Metric | Value | |
|
|
|:--------------------|:-----------| |
|
|
| cosine_accuracy@1 | 0.85 | |
|
|
| cosine_accuracy@3 | 0.9 | |
|
|
| cosine_accuracy@5 | 0.95 | |
|
|
| cosine_accuracy@10 | 0.95 | |
|
|
| cosine_precision@1 | 0.85 | |
|
|
| cosine_precision@3 | 0.3 | |
|
|
| cosine_precision@5 | 0.19 | |
|
|
| cosine_precision@10 | 0.095 | |
|
|
| cosine_recall@1 | 0.85 | |
|
|
| cosine_recall@3 | 0.9 | |
|
|
| cosine_recall@5 | 0.95 | |
|
|
| cosine_recall@10 | 0.95 | |
|
|
| **cosine_ndcg@10** | **0.8943** | |
|
|
| cosine_mrr@10 | 0.8767 | |
|
|
| cosine_map@100 | 0.88 | |
|
|
|
|
|
#### Information Retrieval |
|
|
|
|
|
* Dataset: `dim_64` |
|
|
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator) with these parameters: |
|
|
```json |
|
|
{ |
|
|
"truncate_dim": 64 |
|
|
} |
|
|
``` |
|
|
|
|
|
| Metric | Value | |
|
|
|:--------------------|:-----------| |
|
|
| cosine_accuracy@1 | 0.85 | |
|
|
| cosine_accuracy@3 | 0.9 | |
|
|
| cosine_accuracy@5 | 0.9 | |
|
|
| cosine_accuracy@10 | 1.0 | |
|
|
| cosine_precision@1 | 0.85 | |
|
|
| cosine_precision@3 | 0.3 | |
|
|
| cosine_precision@5 | 0.18 | |
|
|
| cosine_precision@10 | 0.1 | |
|
|
| cosine_recall@1 | 0.85 | |
|
|
| cosine_recall@3 | 0.9 | |
|
|
| cosine_recall@5 | 0.9 | |
|
|
| cosine_recall@10 | 1.0 | |
|
|
| **cosine_ndcg@10** | **0.9074** | |
|
|
| cosine_mrr@10 | 0.8801 | |
|
|
| cosine_map@100 | 0.8801 | |
|
|
|
|
|
<!-- |
|
|
## Bias, Risks and Limitations |
|
|
|
|
|
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.* |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
### Recommendations |
|
|
|
|
|
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.* |
|
|
--> |
|
|
|
|
|
## Training Details |
|
|
|
|
|
### Training Dataset |
|
|
|
|
|
#### Unnamed Dataset |
|
|
|
|
|
* Size: 180 training samples |
|
|
* Columns: <code>anchor</code> and <code>positive</code> |
|
|
* Approximate statistics based on the first 180 samples: |
|
|
| | anchor | positive | |
|
|
|:--------|:-----------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------| |
|
|
| type | string | string | |
|
|
| details | <ul><li>min: 6 tokens</li><li>mean: 12.34 tokens</li><li>max: 117 tokens</li></ul> | <ul><li>min: 14 tokens</li><li>mean: 273.18 tokens</li><li>max: 512 tokens</li></ul> | |
|
|
* Samples: |
|
|
| anchor | positive | |
|
|
|:-----------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |
|
|
| <code>How to implement State?</code> | <code>class State(TypedDict):<br> messages: Annotated[list[str], operator.add]</code> | |
|
|
| <code>Best practices for test_sql_injection_vulnerability</code> | <code>def test_sql_injection_vulnerability(store: SqliteStore) -> None:<br> """Test that SQL injection via malicious filter keys is prevented."""<br> # Add public and private documents<br> store.put(("docs",), "public", {"access": "public", "data": "public info"})<br> store.put(<br> ("docs",), "private", {"access": "private", "data": "secret", "password": "123"}<br> )<br><br> # Normal query - returns 1 public document<br> normal = store.search(("docs",), filter={"access": "public"})<br> assert len(normal) == 1<br> assert normal[0].value["access"] == "public"<br><br> # SQL injection attempt via malicious key should raise ValueError<br> malicious_key = "access') = 'public' OR '1'='1' OR json_extract(value, '$."<br><br> with pytest.raises(ValueError, match="Invalid filter key"):<br> store.search(("docs",), filter={malicious_key: "dummy"})</code> | |
|
|
| <code>Example usage of put_writes</code> | <code>def put_writes(<br> self,<br> config: RunnableConfig,<br> writes: Sequence[tuple[str, Any]],<br> task_id: str,<br> task_path: str = "",<br> ) -> None:<br> """Store intermediate writes linked to a checkpoint.<br><br> This method saves intermediate writes associated with a checkpoint to the Postgres database.<br><br> Args:<br> config: Configuration of the related checkpoint.<br> writes: List of writes to store.<br> task_id: Identifier for the task creating the writes.<br> """<br> query = (<br> self.UPSERT_CHECKPOINT_WRITES_SQL<br> if all(w[0] in WRITES_IDX_MAP for w in writes)<br> else self.INSERT_CHECKPOINT_WRITES_SQL<br> )<br> with self._cursor(pipeline=True) as cur:<br> cur.executemany(<br> query,<br> self._dump_writes(<br> config["configurable"]["thread_id"],<br> config["configurable"]["checkpoint_ns"],<br> config["c...</code> | |
|
|
* Loss: [<code>MatryoshkaLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#matryoshkaloss) with these parameters: |
|
|
```json |
|
|
{ |
|
|
"loss": "MultipleNegativesRankingLoss", |
|
|
"matryoshka_dims": [ |
|
|
768, |
|
|
512, |
|
|
256, |
|
|
128, |
|
|
64 |
|
|
], |
|
|
"matryoshka_weights": [ |
|
|
1, |
|
|
1, |
|
|
1, |
|
|
1, |
|
|
1 |
|
|
], |
|
|
"n_dims_per_step": -1 |
|
|
} |
|
|
``` |
|
|
|
|
|
### Training Hyperparameters |
|
|
#### Non-Default Hyperparameters |
|
|
|
|
|
- `eval_strategy`: epoch |
|
|
- `per_device_train_batch_size`: 4 |
|
|
- `per_device_eval_batch_size`: 4 |
|
|
- `gradient_accumulation_steps`: 16 |
|
|
- `learning_rate`: 2e-05 |
|
|
- `num_train_epochs`: 2 |
|
|
- `lr_scheduler_type`: cosine |
|
|
- `warmup_ratio`: 0.1 |
|
|
- `fp16`: True |
|
|
- `load_best_model_at_end`: True |
|
|
- `optim`: adamw_torch |
|
|
- `batch_sampler`: no_duplicates |
|
|
|
|
|
#### All Hyperparameters |
|
|
<details><summary>Click to expand</summary> |
|
|
|
|
|
- `overwrite_output_dir`: False |
|
|
- `do_predict`: False |
|
|
- `eval_strategy`: epoch |
|
|
- `prediction_loss_only`: True |
|
|
- `per_device_train_batch_size`: 4 |
|
|
- `per_device_eval_batch_size`: 4 |
|
|
- `per_gpu_train_batch_size`: None |
|
|
- `per_gpu_eval_batch_size`: None |
|
|
- `gradient_accumulation_steps`: 16 |
|
|
- `eval_accumulation_steps`: None |
|
|
- `torch_empty_cache_steps`: None |
|
|
- `learning_rate`: 2e-05 |
|
|
- `weight_decay`: 0.0 |
|
|
- `adam_beta1`: 0.9 |
|
|
- `adam_beta2`: 0.999 |
|
|
- `adam_epsilon`: 1e-08 |
|
|
- `max_grad_norm`: 1.0 |
|
|
- `num_train_epochs`: 2 |
|
|
- `max_steps`: -1 |
|
|
- `lr_scheduler_type`: cosine |
|
|
- `lr_scheduler_kwargs`: {} |
|
|
- `warmup_ratio`: 0.1 |
|
|
- `warmup_steps`: 0 |
|
|
- `log_level`: passive |
|
|
- `log_level_replica`: warning |
|
|
- `log_on_each_node`: True |
|
|
- `logging_nan_inf_filter`: True |
|
|
- `save_safetensors`: True |
|
|
- `save_on_each_node`: False |
|
|
- `save_only_model`: False |
|
|
- `restore_callback_states_from_checkpoint`: False |
|
|
- `no_cuda`: False |
|
|
- `use_cpu`: False |
|
|
- `use_mps_device`: False |
|
|
- `seed`: 42 |
|
|
- `data_seed`: None |
|
|
- `jit_mode_eval`: False |
|
|
- `bf16`: False |
|
|
- `fp16`: True |
|
|
- `fp16_opt_level`: O1 |
|
|
- `half_precision_backend`: auto |
|
|
- `bf16_full_eval`: False |
|
|
- `fp16_full_eval`: False |
|
|
- `tf32`: None |
|
|
- `local_rank`: 0 |
|
|
- `ddp_backend`: None |
|
|
- `tpu_num_cores`: None |
|
|
- `tpu_metrics_debug`: False |
|
|
- `debug`: [] |
|
|
- `dataloader_drop_last`: False |
|
|
- `dataloader_num_workers`: 0 |
|
|
- `dataloader_prefetch_factor`: None |
|
|
- `past_index`: -1 |
|
|
- `disable_tqdm`: False |
|
|
- `remove_unused_columns`: True |
|
|
- `label_names`: None |
|
|
- `load_best_model_at_end`: True |
|
|
- `ignore_data_skip`: False |
|
|
- `fsdp`: [] |
|
|
- `fsdp_min_num_params`: 0 |
|
|
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False} |
|
|
- `fsdp_transformer_layer_cls_to_wrap`: None |
|
|
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None} |
|
|
- `parallelism_config`: None |
|
|
- `deepspeed`: None |
|
|
- `label_smoothing_factor`: 0.0 |
|
|
- `optim`: adamw_torch |
|
|
- `optim_args`: None |
|
|
- `adafactor`: False |
|
|
- `group_by_length`: False |
|
|
- `length_column_name`: length |
|
|
- `project`: huggingface |
|
|
- `trackio_space_id`: trackio |
|
|
- `ddp_find_unused_parameters`: None |
|
|
- `ddp_bucket_cap_mb`: None |
|
|
- `ddp_broadcast_buffers`: False |
|
|
- `dataloader_pin_memory`: True |
|
|
- `dataloader_persistent_workers`: False |
|
|
- `skip_memory_metrics`: True |
|
|
- `use_legacy_prediction_loop`: False |
|
|
- `push_to_hub`: False |
|
|
- `resume_from_checkpoint`: None |
|
|
- `hub_model_id`: None |
|
|
- `hub_strategy`: every_save |
|
|
- `hub_private_repo`: None |
|
|
- `hub_always_push`: False |
|
|
- `hub_revision`: None |
|
|
- `gradient_checkpointing`: False |
|
|
- `gradient_checkpointing_kwargs`: None |
|
|
- `include_inputs_for_metrics`: False |
|
|
- `include_for_metrics`: [] |
|
|
- `eval_do_concat_batches`: True |
|
|
- `fp16_backend`: auto |
|
|
- `push_to_hub_model_id`: None |
|
|
- `push_to_hub_organization`: None |
|
|
- `mp_parameters`: |
|
|
- `auto_find_batch_size`: False |
|
|
- `full_determinism`: False |
|
|
- `torchdynamo`: None |
|
|
- `ray_scope`: last |
|
|
- `ddp_timeout`: 1800 |
|
|
- `torch_compile`: False |
|
|
- `torch_compile_backend`: None |
|
|
- `torch_compile_mode`: None |
|
|
- `include_tokens_per_second`: False |
|
|
- `include_num_input_tokens_seen`: no |
|
|
- `neftune_noise_alpha`: None |
|
|
- `optim_target_modules`: None |
|
|
- `batch_eval_metrics`: False |
|
|
- `eval_on_start`: False |
|
|
- `use_liger_kernel`: False |
|
|
- `liger_kernel_config`: None |
|
|
- `eval_use_gather_object`: False |
|
|
- `average_tokens_across_devices`: True |
|
|
- `prompts`: None |
|
|
- `batch_sampler`: no_duplicates |
|
|
- `multi_dataset_batch_sampler`: proportional |
|
|
- `router_mapping`: {} |
|
|
- `learning_rate_mapping`: {} |
|
|
|
|
|
</details> |
|
|
|
|
|
### Training Logs |
|
|
| Epoch | Step | dim_768_cosine_ndcg@10 | dim_512_cosine_ndcg@10 | dim_256_cosine_ndcg@10 | dim_128_cosine_ndcg@10 | dim_64_cosine_ndcg@10 | |
|
|
|:-------:|:-----:|:----------------------:|:----------------------:|:----------------------:|:----------------------:|:---------------------:| |
|
|
| 1.0 | 3 | 0.9409 | 0.9202 | 0.9431 | 0.8412 | 0.9059 | |
|
|
| **2.0** | **6** | **0.9409** | **0.9409** | **0.9409** | **0.8943** | **0.9074** | |
|
|
|
|
|
* The bold row denotes the saved checkpoint. |
|
|
|
|
|
### Framework Versions |
|
|
- Python: 3.14.0 |
|
|
- Sentence Transformers: 5.2.2 |
|
|
- Transformers: 4.57.3 |
|
|
- PyTorch: 2.9.1 |
|
|
- Accelerate: 1.12.0 |
|
|
- Datasets: 4.5.0 |
|
|
- Tokenizers: 0.22.2 |
|
|
|
|
|
## Citation |
|
|
|
|
|
### BibTeX |
|
|
|
|
|
#### Sentence Transformers |
|
|
```bibtex |
|
|
@inproceedings{reimers-2019-sentence-bert, |
|
|
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", |
|
|
author = "Reimers, Nils and Gurevych, Iryna", |
|
|
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", |
|
|
month = "11", |
|
|
year = "2019", |
|
|
publisher = "Association for Computational Linguistics", |
|
|
url = "https://arxiv.org/abs/1908.10084", |
|
|
} |
|
|
``` |
|
|
|
|
|
#### MatryoshkaLoss |
|
|
```bibtex |
|
|
@misc{kusupati2024matryoshka, |
|
|
title={Matryoshka Representation Learning}, |
|
|
author={Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi}, |
|
|
year={2024}, |
|
|
eprint={2205.13147}, |
|
|
archivePrefix={arXiv}, |
|
|
primaryClass={cs.LG} |
|
|
} |
|
|
``` |
|
|
|
|
|
#### MultipleNegativesRankingLoss |
|
|
```bibtex |
|
|
@misc{henderson2017efficient, |
|
|
title={Efficient Natural Language Response Suggestion for Smart Reply}, |
|
|
author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil}, |
|
|
year={2017}, |
|
|
eprint={1705.00652}, |
|
|
archivePrefix={arXiv}, |
|
|
primaryClass={cs.CL} |
|
|
} |
|
|
``` |
|
|
|
|
|
<!-- |
|
|
## Glossary |
|
|
|
|
|
*Clearly define terms in order to be accessible across audiences.* |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
## Model Card Authors |
|
|
|
|
|
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.* |
|
|
--> |
|
|
|
|
|
<!-- |
|
|
## Model Card Contact |
|
|
|
|
|
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.* |
|
|
--> |