Spaces:
Running
Running
v3.1 (#3)
Browse files
- Elastic config updates (468a9ee9b833e3d2ced7be5e7c461345dbb22b86)
- Update ES sources config (64b9f1e6dc6b24f397665d22bc12443295b3bcce)
- Update KB service to use new ES indices (08718d549b2a1271c092a0f73eab727a34fc5f84)
ask_candid/base/config/connections.py
CHANGED
|
@@ -20,10 +20,11 @@ class BaseElasticAPIKeyCredential:
|
|
| 20 |
api_key: str | None = field(default_factory=str)
|
| 21 |
|
| 22 |
|
| 23 |
-
|
| 24 |
cloud_id=_load_value("SEMANTIC_ELASTIC_CLOUD_ID"),
|
| 25 |
api_key=_load_value("SEMANTIC_ELASTIC_API_KEY"),
|
| 26 |
)
|
|
|
|
| 27 |
|
| 28 |
SEMANTIC_ELASTIC_QA_WRITER = BaseElasticAPIKeyCredential(
|
| 29 |
cloud_id=_load_value("SEMANTIC_ELASTIC_WRITER_CLOUD_ID"),
|
|
|
|
| 20 |
api_key: str | None = field(default_factory=str)
|
| 21 |
|
| 22 |
|
| 23 |
+
SEMANTIC_ELASTIC = BaseElasticAPIKeyCredential(
|
| 24 |
cloud_id=_load_value("SEMANTIC_ELASTIC_CLOUD_ID"),
|
| 25 |
api_key=_load_value("SEMANTIC_ELASTIC_API_KEY"),
|
| 26 |
)
|
| 27 |
+
ELSER_INFERENCE_ID = _load_value("ELSER_INFERENCE_ID") or "elser_model_2_linux-x86_64_search"
|
| 28 |
|
| 29 |
SEMANTIC_ELASTIC_QA_WRITER = BaseElasticAPIKeyCredential(
|
| 30 |
cloud_id=_load_value("SEMANTIC_ELASTIC_WRITER_CLOUD_ID"),
|
ask_candid/base/retrieval/sources.py
CHANGED
|
@@ -3,7 +3,7 @@ from ask_candid.base.retrieval.schemas import ElasticSourceConfig
|
|
| 3 |
|
| 4 |
CandidBlogConfig = ElasticSourceConfig(
|
| 5 |
index_name="search-semantic-blog",
|
| 6 |
-
semantic_fields=("semantic_title_summary_tags_text", "semantic_authors_text","semantic_content"),
|
| 7 |
text_fields=("title", "summary", "content", "authors_text"),
|
| 8 |
highlight_fields=("semantic_content",),
|
| 9 |
excluded_fields=("content",)
|
|
@@ -11,14 +11,27 @@ CandidBlogConfig = ElasticSourceConfig(
|
|
| 11 |
|
| 12 |
|
| 13 |
CandidHelpConfig = ElasticSourceConfig(
|
| 14 |
-
index_name="search-semantic-
|
| 15 |
-
semantic_fields=("
|
|
|
|
|
|
|
|
|
|
| 16 |
)
|
| 17 |
|
| 18 |
|
| 19 |
CandidLearningConfig = ElasticSourceConfig(
|
| 20 |
-
index_name="search-semantic-
|
| 21 |
-
semantic_fields=("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
)
|
| 23 |
|
| 24 |
|
|
|
|
| 3 |
|
| 4 |
CandidBlogConfig = ElasticSourceConfig(
|
| 5 |
index_name="search-semantic-blog",
|
| 6 |
+
semantic_fields=("semantic_title_summary_tags_text", "semantic_authors_text", "semantic_content"),
|
| 7 |
text_fields=("title", "summary", "content", "authors_text"),
|
| 8 |
highlight_fields=("semantic_content",),
|
| 9 |
excluded_fields=("content",)
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
CandidHelpConfig = ElasticSourceConfig(
|
| 14 |
+
index_name="search-semantic-help",
|
| 15 |
+
semantic_fields=("semantic_content", "semantic_title_summary_question_category"),
|
| 16 |
+
text_fields=("title", "summary", "content_question"),
|
| 17 |
+
highlight_fields=("semantic_content",),
|
| 18 |
+
excluded_fields=("content_html", "content")
|
| 19 |
)
|
| 20 |
|
| 21 |
|
| 22 |
CandidLearningConfig = ElasticSourceConfig(
|
| 23 |
+
index_name="search-semantic-learning",
|
| 24 |
+
semantic_fields=("semantic_title_short_description", "semantic_lessons_description","semantic_lessons_content"),
|
| 25 |
+
text_fields=("title", "short_description", "lesson_list.description", "lessson_content.content"),
|
| 26 |
+
highlight_fields=("semantic_lessons_content",),
|
| 27 |
+
excluded_fields=(
|
| 28 |
+
"lesson_content.content_html",
|
| 29 |
+
"lesson_list.description_html",
|
| 30 |
+
"semantic_lessons_content",
|
| 31 |
+
"semantic_lessons_description",
|
| 32 |
+
"lesson_content.content",
|
| 33 |
+
"lesson_list.description"
|
| 34 |
+
)
|
| 35 |
)
|
| 36 |
|
| 37 |
|
ask_candid/services/knowledge_base.py
CHANGED
|
@@ -6,7 +6,7 @@ import logging
|
|
| 6 |
from langchain_core.documents import Document
|
| 7 |
|
| 8 |
from ask_candid.base.retrieval.elastic import (
|
| 9 |
-
build_sparse_vector_query,
|
| 10 |
build_sparse_vector_and_text_query,
|
| 11 |
news_query_builder,
|
| 12 |
issuelab_query_builder,
|
|
@@ -15,7 +15,7 @@ from ask_candid.base.retrieval.elastic import (
|
|
| 15 |
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
|
| 16 |
from ask_candid.base.retrieval.schemas import ElasticHitsResult
|
| 17 |
import ask_candid.base.retrieval.sources as S
|
| 18 |
-
from ask_candid.base.config.connections import
|
| 19 |
from ask_candid.services.small_lm import CandidSmallLanguageModel
|
| 20 |
|
| 21 |
SourceNames = Literal[
|
|
@@ -32,7 +32,6 @@ logger = logging.getLogger(__name__)
|
|
| 32 |
logger.setLevel(logging.INFO)
|
| 33 |
|
| 34 |
|
| 35 |
-
# TODO remove
|
| 36 |
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
|
| 37 |
"""Pads the relevant chunk of text with context before and after
|
| 38 |
|
|
@@ -123,18 +122,31 @@ def generate_queries(
|
|
| 123 |
semantic_fields=S.CandidBlogConfig.semantic_fields,
|
| 124 |
text_fields=S.CandidBlogConfig.text_fields,
|
| 125 |
highlight_fields=S.CandidBlogConfig.highlight_fields,
|
| 126 |
-
excluded_fields=S.CandidBlogConfig.excluded_fields
|
|
|
|
| 127 |
)
|
| 128 |
q["size"] = 5
|
| 129 |
vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
|
| 130 |
elif source_name == "Candid Help":
|
| 131 |
-
q =
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
q["size"] = 5
|
| 134 |
vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
|
| 135 |
elif source_name == "Candid Learning":
|
| 136 |
-
q =
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
q["size"] = 5
|
| 139 |
vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
|
| 140 |
elif source_name == "Candid News":
|
|
@@ -166,7 +178,8 @@ def generate_queries(
|
|
| 166 |
semantic_fields=S.YoutubeConfig.semantic_fields,
|
| 167 |
text_fields=S.YoutubeConfig.text_fields,
|
| 168 |
highlight_fields=S.YoutubeConfig.highlight_fields,
|
| 169 |
-
excluded_fields=S.YoutubeConfig.excluded_fields
|
|
|
|
| 170 |
)
|
| 171 |
q["size"] = 5
|
| 172 |
vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
|
|
@@ -215,7 +228,7 @@ def run_search(
|
|
| 215 |
|
| 216 |
results = []
|
| 217 |
if vector_searches is not None and len(vector_searches) > 0:
|
| 218 |
-
hits = multi_search_base(queries=vector_searches, credentials=
|
| 219 |
for hit in _msearch_response_generator(responses=hits):
|
| 220 |
results.append(hit)
|
| 221 |
if non_vector_searches is not None and len(non_vector_searches) > 0:
|
|
@@ -368,6 +381,7 @@ def process_hit(hit: ElasticHitsResult) -> Document:
|
|
| 368 |
)
|
| 369 |
elif "blog" in hit.index:
|
| 370 |
highlight = hit.highlight or {}
|
|
|
|
| 371 |
doc = Document(
|
| 372 |
page_content='\n\n'.join([
|
| 373 |
hit.source.get("title_summary_tags_text", ""),
|
|
@@ -378,35 +392,35 @@ def process_hit(hit: ElasticHitsResult) -> Document:
|
|
| 378 |
"title": hit.source.get("title", ""),
|
| 379 |
"source": "Candid Blog",
|
| 380 |
"source_id": hit.source["id"],
|
| 381 |
-
"url":
|
| 382 |
}
|
| 383 |
)
|
| 384 |
-
elif "
|
|
|
|
| 385 |
doc = Document(
|
| 386 |
page_content='\n\n'.join([
|
| 387 |
-
hit.source.get("
|
| 388 |
-
|
| 389 |
-
hit.source.get("training_topics", ""),
|
| 390 |
-
get_context("content", hit, context_length=12)
|
| 391 |
]),
|
| 392 |
metadata={
|
| 393 |
"title": hit.source["title"],
|
| 394 |
"source": "Candid Learning",
|
| 395 |
-
"source_id": hit.source["
|
| 396 |
-
"url": hit.source.get("
|
| 397 |
}
|
| 398 |
)
|
| 399 |
-
elif "
|
|
|
|
| 400 |
doc = Document(
|
| 401 |
page_content='\n\n'.join([
|
| 402 |
-
hit.source.get("
|
| 403 |
-
|
| 404 |
]),
|
| 405 |
metadata={
|
| 406 |
"title": hit.source.get("title", ""),
|
| 407 |
"source": "Candid Help",
|
| 408 |
-
"source_id": hit.source["
|
| 409 |
-
"url": hit.source.get("
|
| 410 |
}
|
| 411 |
)
|
| 412 |
elif "news" in hit.index:
|
|
|
|
| 6 |
from langchain_core.documents import Document
|
| 7 |
|
| 8 |
from ask_candid.base.retrieval.elastic import (
|
| 9 |
+
# build_sparse_vector_query,
|
| 10 |
build_sparse_vector_and_text_query,
|
| 11 |
news_query_builder,
|
| 12 |
issuelab_query_builder,
|
|
|
|
| 15 |
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
|
| 16 |
from ask_candid.base.retrieval.schemas import ElasticHitsResult
|
| 17 |
import ask_candid.base.retrieval.sources as S
|
| 18 |
+
from ask_candid.base.config.connections import SEMANTIC_ELASTIC, ELSER_INFERENCE_ID, NEWS_ELASTIC
|
| 19 |
from ask_candid.services.small_lm import CandidSmallLanguageModel
|
| 20 |
|
| 21 |
SourceNames = Literal[
|
|
|
|
| 32 |
logger.setLevel(logging.INFO)
|
| 33 |
|
| 34 |
|
|
|
|
| 35 |
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
|
| 36 |
"""Pads the relevant chunk of text with context before and after
|
| 37 |
|
|
|
|
| 122 |
semantic_fields=S.CandidBlogConfig.semantic_fields,
|
| 123 |
text_fields=S.CandidBlogConfig.text_fields,
|
| 124 |
highlight_fields=S.CandidBlogConfig.highlight_fields,
|
| 125 |
+
excluded_fields=S.CandidBlogConfig.excluded_fields,
|
| 126 |
+
inference_id=ELSER_INFERENCE_ID
|
| 127 |
)
|
| 128 |
q["size"] = 5
|
| 129 |
vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
|
| 130 |
elif source_name == "Candid Help":
|
| 131 |
+
q = build_sparse_vector_and_text_query(
|
| 132 |
+
query=query,
|
| 133 |
+
semantic_fields=S.CandidHelpConfig.semantic_fields,
|
| 134 |
+
text_fields=S.CandidHelpConfig.text_fields,
|
| 135 |
+
highlight_fields=S.CandidHelpConfig.highlight_fields,
|
| 136 |
+
excluded_fields=S.CandidHelpConfig.excluded_fields,
|
| 137 |
+
inference_id=ELSER_INFERENCE_ID
|
| 138 |
+
)
|
| 139 |
q["size"] = 5
|
| 140 |
vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
|
| 141 |
elif source_name == "Candid Learning":
|
| 142 |
+
q = build_sparse_vector_and_text_query(
|
| 143 |
+
query=query,
|
| 144 |
+
semantic_fields=S.CandidLearningConfig.semantic_fields,
|
| 145 |
+
text_fields=S.CandidLearningConfig.text_fields,
|
| 146 |
+
highlight_fields=S.CandidLearningConfig.highlight_fields,
|
| 147 |
+
excluded_fields=S.CandidLearningConfig.excluded_fields,
|
| 148 |
+
inference_id=ELSER_INFERENCE_ID
|
| 149 |
+
)
|
| 150 |
q["size"] = 5
|
| 151 |
vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
|
| 152 |
elif source_name == "Candid News":
|
|
|
|
| 178 |
semantic_fields=S.YoutubeConfig.semantic_fields,
|
| 179 |
text_fields=S.YoutubeConfig.text_fields,
|
| 180 |
highlight_fields=S.YoutubeConfig.highlight_fields,
|
| 181 |
+
excluded_fields=S.YoutubeConfig.excluded_fields,
|
| 182 |
+
inference_id=ELSER_INFERENCE_ID
|
| 183 |
)
|
| 184 |
q["size"] = 5
|
| 185 |
vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
|
|
|
|
| 228 |
|
| 229 |
results = []
|
| 230 |
if vector_searches is not None and len(vector_searches) > 0:
|
| 231 |
+
hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC)
|
| 232 |
for hit in _msearch_response_generator(responses=hits):
|
| 233 |
results.append(hit)
|
| 234 |
if non_vector_searches is not None and len(non_vector_searches) > 0:
|
|
|
|
| 381 |
)
|
| 382 |
elif "blog" in hit.index:
|
| 383 |
highlight = hit.highlight or {}
|
| 384 |
+
blog_url = hit.source.get("link", "")
|
| 385 |
doc = Document(
|
| 386 |
page_content='\n\n'.join([
|
| 387 |
hit.source.get("title_summary_tags_text", ""),
|
|
|
|
| 392 |
"title": hit.source.get("title", ""),
|
| 393 |
"source": "Candid Blog",
|
| 394 |
"source_id": hit.source["id"],
|
| 395 |
+
"url": blog_url
|
| 396 |
}
|
| 397 |
)
|
| 398 |
+
elif "learning" in hit.index:
|
| 399 |
+
highlight = hit.highlight or {}
|
| 400 |
doc = Document(
|
| 401 |
page_content='\n\n'.join([
|
| 402 |
+
hit.source.get("semantic_title_short_description", ""),
|
| 403 |
+
' '.join(highlight.get("semantic_lessons_content", []))
|
|
|
|
|
|
|
| 404 |
]),
|
| 405 |
metadata={
|
| 406 |
"title": hit.source["title"],
|
| 407 |
"source": "Candid Learning",
|
| 408 |
+
"source_id": hit.source["course_id"],
|
| 409 |
+
"url": hit.source.get("course_url", "")
|
| 410 |
}
|
| 411 |
)
|
| 412 |
+
elif "help" in hit.index:
|
| 413 |
+
highlight = hit.highlight or {}
|
| 414 |
doc = Document(
|
| 415 |
page_content='\n\n'.join([
|
| 416 |
+
hit.source.get("semantic_title_summary_question_category", ""),
|
| 417 |
+
' '.join(highlight.get("semantic_content", []))
|
| 418 |
]),
|
| 419 |
metadata={
|
| 420 |
"title": hit.source.get("title", ""),
|
| 421 |
"source": "Candid Help",
|
| 422 |
+
"source_id": hit.source["article_id"],
|
| 423 |
+
"url": f"""https://help.candid.org/s/article/{hit.source.get("url", "")}"""
|
| 424 |
}
|
| 425 |
)
|
| 426 |
elif "news" in hit.index:
|