ask_candid/base/config/connections.py CHANGED
@@ -20,10 +20,11 @@ class BaseElasticAPIKeyCredential:
20
  api_key: str | None = field(default_factory=str)
21
 
22
 
23
- SEMANTIC_ELASTIC_QA = BaseElasticAPIKeyCredential(
24
  cloud_id=_load_value("SEMANTIC_ELASTIC_CLOUD_ID"),
25
  api_key=_load_value("SEMANTIC_ELASTIC_API_KEY"),
26
  )
 
27
 
28
  SEMANTIC_ELASTIC_QA_WRITER = BaseElasticAPIKeyCredential(
29
  cloud_id=_load_value("SEMANTIC_ELASTIC_WRITER_CLOUD_ID"),
 
20
  api_key: str | None = field(default_factory=str)
21
 
22
 
23
+ SEMANTIC_ELASTIC = BaseElasticAPIKeyCredential(
24
  cloud_id=_load_value("SEMANTIC_ELASTIC_CLOUD_ID"),
25
  api_key=_load_value("SEMANTIC_ELASTIC_API_KEY"),
26
  )
27
+ ELSER_INFERENCE_ID = _load_value("ELSER_INFERENCE_ID") or "elser_model_2_linux-x86_64_search"
28
 
29
  SEMANTIC_ELASTIC_QA_WRITER = BaseElasticAPIKeyCredential(
30
  cloud_id=_load_value("SEMANTIC_ELASTIC_WRITER_CLOUD_ID"),
ask_candid/base/retrieval/sources.py CHANGED
@@ -3,7 +3,7 @@ from ask_candid.base.retrieval.schemas import ElasticSourceConfig
3
 
4
  CandidBlogConfig = ElasticSourceConfig(
5
  index_name="search-semantic-blog",
6
- semantic_fields=("semantic_title_summary_tags_text", "semantic_authors_text","semantic_content"),
7
  text_fields=("title", "summary", "content", "authors_text"),
8
  highlight_fields=("semantic_content",),
9
  excluded_fields=("content",)
@@ -11,14 +11,27 @@ CandidBlogConfig = ElasticSourceConfig(
11
 
12
 
13
  CandidHelpConfig = ElasticSourceConfig(
14
- index_name="search-semantic-candid-help-elser_ve1",
15
- semantic_fields=("content", "combined_article_description")
 
 
 
16
  )
17
 
18
 
19
  CandidLearningConfig = ElasticSourceConfig(
20
- index_name="search-semantic-candid-learning_ve1",
21
- semantic_fields=("content", "title", "training_topics", "staff_recommendations")
 
 
 
 
 
 
 
 
 
 
22
  )
23
 
24
 
 
3
 
4
  CandidBlogConfig = ElasticSourceConfig(
5
  index_name="search-semantic-blog",
6
+ semantic_fields=("semantic_title_summary_tags_text", "semantic_authors_text", "semantic_content"),
7
  text_fields=("title", "summary", "content", "authors_text"),
8
  highlight_fields=("semantic_content",),
9
  excluded_fields=("content",)
 
11
 
12
 
13
  CandidHelpConfig = ElasticSourceConfig(
14
+ index_name="search-semantic-help",
15
+ semantic_fields=("semantic_content", "semantic_title_summary_question_category"),
16
+ text_fields=("title", "summary", "content_question"),
17
+ highlight_fields=("semantic_content",),
18
+ excluded_fields=("content_html", "content")
19
  )
20
 
21
 
22
  CandidLearningConfig = ElasticSourceConfig(
23
+ index_name="search-semantic-learning",
24
+ semantic_fields=("semantic_title_short_description", "semantic_lessons_description","semantic_lessons_content"),
25
+ text_fields=("title", "short_description", "lesson_list.description", "lessson_content.content"),
26
+ highlight_fields=("semantic_lessons_content",),
27
+ excluded_fields=(
28
+ "lesson_content.content_html",
29
+ "lesson_list.description_html",
30
+ "semantic_lessons_content",
31
+ "semantic_lessons_description",
32
+ "lesson_content.content",
33
+ "lesson_list.description"
34
+ )
35
  )
36
 
37
 
ask_candid/services/knowledge_base.py CHANGED
@@ -6,7 +6,7 @@ import logging
6
  from langchain_core.documents import Document
7
 
8
  from ask_candid.base.retrieval.elastic import (
9
- build_sparse_vector_query,
10
  build_sparse_vector_and_text_query,
11
  news_query_builder,
12
  issuelab_query_builder,
@@ -15,7 +15,7 @@ from ask_candid.base.retrieval.elastic import (
15
  from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
16
  from ask_candid.base.retrieval.schemas import ElasticHitsResult
17
  import ask_candid.base.retrieval.sources as S
18
- from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
19
  from ask_candid.services.small_lm import CandidSmallLanguageModel
20
 
21
  SourceNames = Literal[
@@ -32,7 +32,6 @@ logger = logging.getLogger(__name__)
32
  logger.setLevel(logging.INFO)
33
 
34
 
35
- # TODO remove
36
  def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
37
  """Pads the relevant chunk of text with context before and after
38
 
@@ -123,18 +122,31 @@ def generate_queries(
123
  semantic_fields=S.CandidBlogConfig.semantic_fields,
124
  text_fields=S.CandidBlogConfig.text_fields,
125
  highlight_fields=S.CandidBlogConfig.highlight_fields,
126
- excluded_fields=S.CandidBlogConfig.excluded_fields
 
127
  )
128
  q["size"] = 5
129
  vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
130
  elif source_name == "Candid Help":
131
- q = build_sparse_vector_query(query=query, fields=S.CandidHelpConfig.semantic_fields)
132
- q["_source"] = {"excludes": ["embeddings"]}
 
 
 
 
 
 
133
  q["size"] = 5
134
  vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
135
  elif source_name == "Candid Learning":
136
- q = build_sparse_vector_query(query=query, fields=S.CandidLearningConfig.semantic_fields)
137
- q["_source"] = {"excludes": ["embeddings"]}
 
 
 
 
 
 
138
  q["size"] = 5
139
  vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
140
  elif source_name == "Candid News":
@@ -166,7 +178,8 @@ def generate_queries(
166
  semantic_fields=S.YoutubeConfig.semantic_fields,
167
  text_fields=S.YoutubeConfig.text_fields,
168
  highlight_fields=S.YoutubeConfig.highlight_fields,
169
- excluded_fields=S.YoutubeConfig.excluded_fields
 
170
  )
171
  q["size"] = 5
172
  vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
@@ -215,7 +228,7 @@ def run_search(
215
 
216
  results = []
217
  if vector_searches is not None and len(vector_searches) > 0:
218
- hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC_QA)
219
  for hit in _msearch_response_generator(responses=hits):
220
  results.append(hit)
221
  if non_vector_searches is not None and len(non_vector_searches) > 0:
@@ -368,6 +381,7 @@ def process_hit(hit: ElasticHitsResult) -> Document:
368
  )
369
  elif "blog" in hit.index:
370
  highlight = hit.highlight or {}
 
371
  doc = Document(
372
  page_content='\n\n'.join([
373
  hit.source.get("title_summary_tags_text", ""),
@@ -378,35 +392,35 @@ def process_hit(hit: ElasticHitsResult) -> Document:
378
  "title": hit.source.get("title", ""),
379
  "source": "Candid Blog",
380
  "source_id": hit.source["id"],
381
- "url": hit.source["link"]
382
  }
383
  )
384
- elif "candid-learning" in hit.index:
 
385
  doc = Document(
386
  page_content='\n\n'.join([
387
- hit.source.get("title", ""),
388
- hit.source.get("staff_recommendations", ""),
389
- hit.source.get("training_topics", ""),
390
- get_context("content", hit, context_length=12)
391
  ]),
392
  metadata={
393
  "title": hit.source["title"],
394
  "source": "Candid Learning",
395
- "source_id": hit.source["post_id"],
396
- "url": hit.source.get("url", "")
397
  }
398
  )
399
- elif "candid-help" in hit.index:
 
400
  doc = Document(
401
  page_content='\n\n'.join([
402
- hit.source.get("combined_article_description", ""),
403
- get_context("content", hit, context_length=12)
404
  ]),
405
  metadata={
406
  "title": hit.source.get("title", ""),
407
  "source": "Candid Help",
408
- "source_id": hit.source["id"],
409
- "url": hit.source.get("link", "")
410
  }
411
  )
412
  elif "news" in hit.index:
 
6
  from langchain_core.documents import Document
7
 
8
  from ask_candid.base.retrieval.elastic import (
9
+ # build_sparse_vector_query,
10
  build_sparse_vector_and_text_query,
11
  news_query_builder,
12
  issuelab_query_builder,
 
15
  from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
16
  from ask_candid.base.retrieval.schemas import ElasticHitsResult
17
  import ask_candid.base.retrieval.sources as S
18
+ from ask_candid.base.config.connections import SEMANTIC_ELASTIC, ELSER_INFERENCE_ID, NEWS_ELASTIC
19
  from ask_candid.services.small_lm import CandidSmallLanguageModel
20
 
21
  SourceNames = Literal[
 
32
  logger.setLevel(logging.INFO)
33
 
34
 
 
35
  def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
36
  """Pads the relevant chunk of text with context before and after
37
 
 
122
  semantic_fields=S.CandidBlogConfig.semantic_fields,
123
  text_fields=S.CandidBlogConfig.text_fields,
124
  highlight_fields=S.CandidBlogConfig.highlight_fields,
125
+ excluded_fields=S.CandidBlogConfig.excluded_fields,
126
+ inference_id=ELSER_INFERENCE_ID
127
  )
128
  q["size"] = 5
129
  vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
130
  elif source_name == "Candid Help":
131
+ q = build_sparse_vector_and_text_query(
132
+ query=query,
133
+ semantic_fields=S.CandidHelpConfig.semantic_fields,
134
+ text_fields=S.CandidHelpConfig.text_fields,
135
+ highlight_fields=S.CandidHelpConfig.highlight_fields,
136
+ excluded_fields=S.CandidHelpConfig.excluded_fields,
137
+ inference_id=ELSER_INFERENCE_ID
138
+ )
139
  q["size"] = 5
140
  vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
141
  elif source_name == "Candid Learning":
142
+ q = build_sparse_vector_and_text_query(
143
+ query=query,
144
+ semantic_fields=S.CandidLearningConfig.semantic_fields,
145
+ text_fields=S.CandidLearningConfig.text_fields,
146
+ highlight_fields=S.CandidLearningConfig.highlight_fields,
147
+ excluded_fields=S.CandidLearningConfig.excluded_fields,
148
+ inference_id=ELSER_INFERENCE_ID
149
+ )
150
  q["size"] = 5
151
  vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
152
  elif source_name == "Candid News":
 
178
  semantic_fields=S.YoutubeConfig.semantic_fields,
179
  text_fields=S.YoutubeConfig.text_fields,
180
  highlight_fields=S.YoutubeConfig.highlight_fields,
181
+ excluded_fields=S.YoutubeConfig.excluded_fields,
182
+ inference_id=ELSER_INFERENCE_ID
183
  )
184
  q["size"] = 5
185
  vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
 
228
 
229
  results = []
230
  if vector_searches is not None and len(vector_searches) > 0:
231
+ hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC)
232
  for hit in _msearch_response_generator(responses=hits):
233
  results.append(hit)
234
  if non_vector_searches is not None and len(non_vector_searches) > 0:
 
381
  )
382
  elif "blog" in hit.index:
383
  highlight = hit.highlight or {}
384
+ blog_url = hit.source.get("link", "")
385
  doc = Document(
386
  page_content='\n\n'.join([
387
  hit.source.get("title_summary_tags_text", ""),
 
392
  "title": hit.source.get("title", ""),
393
  "source": "Candid Blog",
394
  "source_id": hit.source["id"],
395
+ "url": blog_url
396
  }
397
  )
398
+ elif "learning" in hit.index:
399
+ highlight = hit.highlight or {}
400
  doc = Document(
401
  page_content='\n\n'.join([
402
+ hit.source.get("semantic_title_short_description", ""),
403
+ ' '.join(highlight.get("semantic_lessons_content", []))
 
 
404
  ]),
405
  metadata={
406
  "title": hit.source["title"],
407
  "source": "Candid Learning",
408
+ "source_id": hit.source["course_id"],
409
+ "url": hit.source.get("course_url", "")
410
  }
411
  )
412
+ elif "help" in hit.index:
413
+ highlight = hit.highlight or {}
414
  doc = Document(
415
  page_content='\n\n'.join([
416
+ hit.source.get("semantic_title_summary_question_category", ""),
417
+ ' '.join(highlight.get("semantic_content", []))
418
  ]),
419
  metadata={
420
  "title": hit.source.get("title", ""),
421
  "source": "Candid Help",
422
+ "source_id": hit.source["article_id"],
423
+ "url": f"""https://help.candid.org/s/article/{hit.source.get("url", "")}"""
424
  }
425
  )
426
  elif "news" in hit.index: