Spaces:

CandidAI
/

ask-candid

Running

App Files Files Community

brainsqueeze commited on Sep 29

Commit

fc7f1bb

verified ·

1 Parent(s): f002446

Delete ask_candid/base/retrieval/knowledge_base.py

Browse files

Files changed (1) hide show

ask_candid/base/retrieval/knowledge_base.py +0 -362

ask_candid/base/retrieval/knowledge_base.py DELETED Viewed

@@ -1,362 +0,0 @@
-from typing import Literal, Any
-from collections.abc import Iterator, Iterable
-from itertools import groupby
-import logging
-from langchain_core.documents import Document
-from ask_candid.base.retrieval.elastic import (
-    build_sparse_vector_query,
-    build_sparse_vector_and_text_query,
-    news_query_builder,
-    multi_search_base
-)
-from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
-from ask_candid.base.retrieval.schemas import ElasticHitsResult
-import ask_candid.base.retrieval.sources as S
-from ask_candid.services.small_lm import CandidSLM
-from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
-SourceNames = Literal[
-    "Candid Blog",
-    "Candid Help",
-    "Candid Learning",
-    "Candid News",
-    "IssueLab Research Reports",
-    "YouTube Training"
-]
-sparse_encoder = SpladeEncoder()
-logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-# TODO remove
-def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
-    """Pads the relevant chunk of text with context before and after
-    Parameters
-    ----------
-    field_name : str
-        a field with the long text that was chunked into pieces
-    hit : ElasticHitsResult
-    context_length : int, optional
-        length of text to add before and after the chunk, by default 1024
-    add_context : bool, optional
-        Set to `False` to expand the text context by searching for the Elastic inner hit inside the larger document
-        , by default True
-    Returns
-    -------
-    str
-        longer chunks stuffed together
-    """
-    chunks = []
-    # NOTE chunks have tokens, long text is a string, but may contain html which affects tokenization
-    long_text = hit.source.get(field_name) or ""
-    long_text = long_text.lower()
-    inner_hits_field = f"embeddings.{field_name}.chunks"
-    found_chunks = hit.inner_hits.get(inner_hits_field, {}) if hit.inner_hits else None
-    if found_chunks:
-        for h in found_chunks.get("hits", {}).get("hits") or []:
-            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
-            # cutting the middle because we may have tokenizing artifacts there
-            chunk = chunk[3: -3]
-            if add_context:
-                # Find the start and end indices of the chunk in the large text
-                start_index = long_text.find(chunk[:20])
-                # Chunk is found
-                if start_index != -1:
-                    end_index = start_index + len(chunk)
-                    pre_start_index = max(0, start_index - context_length)
-                    post_end_index = min(len(long_text), end_index + context_length)
-                    chunks.append(long_text[pre_start_index:post_end_index])
-            else:
-                chunks.append(chunk)
-    return '\n\n'.join(chunks)
-def generate_queries(
-    query: str,
-    sources: list[SourceNames],
-    news_days_ago: int = 60
-) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
-    """Builds Elastic queries against indices which do or do not support sparse vector queries.
-    Parameters
-    ----------
-    query : str
-        Text describing a user's question or a description of investigative work which requires support from Candid's
-        knowledge base
-    sources : list[SourceNames]
-        One or more sources of knowledge from different areas at Candid.
-        * Candid Blog: Blog posts from Candid staff and trusted partners intended to help those in the sector or
-        illuminate ongoing work
-        * Candid Help: Candid FAQs to help user's get started with Candid's product platform and learning resources
-        * Candid Learning: Training documents from Candid's subject matter experts
-        * Candid News: News articles and press releases about real-time activity in the philanthropic sector
-        * IssueLab Research Reports: Academic research reports about the social/philanthropic sector
-        * YouTube Training: Transcripts from video-based training seminars from Candid's subject matter experts
-    news_days_ago : int, optional
-        How many days in the past to search for news articles, if a user is asking for recent trends then this value
-        should be set lower >~ 10, by default 60
-    Returns
-    -------
-    tuple[list[dict[str, Any]], list[dict[str, Any]]]
-        (sparse vector queries, queries for indices which do not support sparse vectors)
-    """
-    vector_queries = []
-    quasi_vector_queries = []
-    for source_name in sources:
-        if source_name == "Candid Blog":
-            q = build_sparse_vector_query(query=query, fields=S.CandidBlogConfig.semantic_fields)
-            q["_source"] = {"excludes": ["embeddings"]}
-            q["size"] = 5
-            vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
-        elif source_name == "Candid Help":
-            q = build_sparse_vector_query(query=query, fields=S.CandidHelpConfig.semantic_fields)
-            q["_source"] = {"excludes": ["embeddings"]}
-            q["size"] = 5
-            vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
-        elif source_name == "Candid Learning":
-            q = build_sparse_vector_query(query=query, fields=S.CandidLearningConfig.semantic_fields)
-            q["_source"] = {"excludes": ["embeddings"]}
-            q["size"] = 5
-            vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
-        elif source_name == "Candid News":
-            q = news_query_builder(
-                query=query,
-                fields=S.CandidNewsConfig.semantic_fields,
-                encoder=sparse_encoder,
-                days_ago=news_days_ago
-            )
-            q["size"] = 5
-            quasi_vector_queries.extend([{"index": S.CandidNewsConfig.index_name}, q])
-        elif source_name == "IssueLab Research Reports":
-            q = build_sparse_vector_query(query=query, fields=S.IssueLabConfig.semantic_fields)
-            q["_source"] = {"excludes": ["embeddings"]}
-            q["size"] = 1
-            vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])
-        elif source_name == "YouTube Training":
-            q = build_sparse_vector_and_text_query(
-                query=query,
-                semantic_fields=S.YoutubeConfig.semantic_fields,
-                text_fields=S.YoutubeConfig.text_fields,
-                highlight_fields=S.YoutubeConfig.highlight_fields,
-                excluded_fields=S.YoutubeConfig.excluded_fields
-            )
-            q["size"] = 5
-            vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
-    return vector_queries, quasi_vector_queries
-def run_search(
-    vector_searches: list[dict[str, Any]] | None = None,
-    non_vector_searches: list[dict[str, Any]] | None = None,
-) -> list[ElasticHitsResult]:
-    def _msearch_response_generator(responses: Iterable[dict[str, Any]]) -> Iterator[ElasticHitsResult]:
-        for query_group in responses:
-            for h in query_group.get("hits", {}).get("hits", []):
-                inner_hits = h.get("inner_hits", {})
-                if not inner_hits and "news" in h.get("_index"):
-                    inner_hits = {"text": h.get("_source", {}).get("content")}
-                yield ElasticHitsResult(
-                    index=h["_index"],
-                    id=h["_id"],
-                    score=h["_score"],
-                    source=h["_source"],
-                    inner_hits=inner_hits,
-                    highlight=h.get("highlight", {})
-                )
-    results = []
-    if vector_searches is not None and len(vector_searches) > 0:
-        hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC_QA)
-        for hit in _msearch_response_generator(responses=hits):
-            results.append(hit)
-    if non_vector_searches is not None and len(non_vector_searches) > 0:
-        hits = multi_search_base(queries=non_vector_searches, credentials=NEWS_ELASTIC)
-        for hit in _msearch_response_generator(responses=hits):
-            results.append(hit)
-    return results
-def retrieved_text(hits: dict[str, Any]) -> str:
-    """Extracts retrieved sub-texts from documents which are strong hits from semantic queries for the purpose of
-    re-scoring by a secondary language model.
-    Parameters
-    ----------
-    hits : Dict[str, Any]
-    Returns
-    -------
-    str
-    """
-    nlp = CandidSLM()
-    text = []
-    for _, v in hits.items():
-        if _ == "text":
-            s = nlp.summarize(v, top_k=3)
-            text.append(s.summary)
-            # text.append(v)
-            continue
-        for h in (v.get("hits", {}).get("hits") or []):
-            for _, field in h.get("fields", {}).items():
-                for chunk in field:
-                    if chunk.get("chunk"):
-                        text.extend(chunk["chunk"])
-    return '\n'.join(text)
-def reranker(
-    query_results: Iterable[ElasticHitsResult],
-    search_text: str | None = None,
-    max_num_results: int = 5
-) -> Iterator[ElasticHitsResult]:
-    """Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
-    This will shuffle results
-    Parameters
-    ----------
-    query_results : Iterable[ElasticHitsResult]
-    Yields
-    ------
-    Iterator[ElasticHitsResult]
-    """
-    results: list[ElasticHitsResult] = []
-    texts: list[str] = []
-    for _, data in groupby(query_results, key=lambda x: x.index):
-        data = list(data)  # noqa: PLW2901
-        max_score = max(data, key=lambda x: x.score).score
-        min_score = min(data, key=lambda x: x.score).score
-        for d in data:
-            d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
-            results.append(d)
-            if search_text:
-                if d.inner_hits:
-                    text = retrieved_text(d.inner_hits)
-                if d.highlight:
-                    highlight_texts = []
-                    for k,v in d.highlight.items():
-                        v_text = '\n'.join(v)
-                        highlight_texts.append(v_text)
-                    text = '\n'.join(highlight_texts)
-                texts.append(text)
-    if search_text and len(texts) == len(results) and len(texts) > 1:
-        logger.info("Re-ranking %d retrieval results", len(results))
-        scores = sparse_encoder.query_reranking(query=search_text, documents=texts)
-        for r, s in zip(results, scores):
-            r.score = s
-    yield from sorted(results, key=lambda x: x.score, reverse=True)[:max_num_results]
-def process_hit(hit: ElasticHitsResult) -> Document:
-    if "issuelab-elser" in hit.index:
-        doc = Document(
-            page_content='\n\n'.join([
-                hit.source.get("combined_item_description", ""),
-                hit.source.get("description", ""),
-                hit.source.get("combined_issuelab_findings", ""),
-                get_context("content", hit, context_length=12)
-            ]),
-            metadata={
-                "title": hit.source["title"],
-                "source": "IssueLab",
-                "source_id": hit.source["resource_id"],
-                "url": hit.source.get("permalink", "")
-            }
-        )
-    elif "youtube" in hit.index:
-        highlight = hit.highlight or {}
-        doc = Document(
-            page_content='\n\n'.join([
-                hit.source.get("title", ""),
-                hit.source.get("semantic_description", ""),
-                ' '.join(highlight.get("semantic_cc_text", []))
-            ]),
-            metadata={
-                "title": hit.source.get("title", ""),
-                "source": "Candid YouTube",
-                "source_id": hit.source['video_id'],
-                "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
-            }
-        )
-    elif "candid-blog" in hit.index:
-        doc = Document(
-            page_content='\n\n'.join([
-                hit.source.get("title", ""),
-                hit.source.get("excerpt", ""),
-                get_context("content", hit, context_length=12, add_context=False),
-                get_context("authors_text", hit, context_length=12, add_context=False),
-                hit.source.get("title_summary_tags", "")
-            ]),
-            metadata={
-                "title": hit.source.get("title", ""),
-                "source": "Candid Blog",
-                "source_id": hit.source["id"],
-                "url": hit.source["link"]
-            }
-        )
-    elif "candid-learning" in hit.index:
-        doc = Document(
-            page_content='\n\n'.join([
-                hit.source.get("title", ""),
-                hit.source.get("staff_recommendations", ""),
-                hit.source.get("training_topics", ""),
-                get_context("content", hit, context_length=12)
-            ]),
-            metadata={
-                "title": hit.source["title"],
-                "source": "Candid Learning",
-                "source_id": hit.source["post_id"],
-                "url": hit.source.get("url", "")
-            }
-        )
-    elif "candid-help" in hit.index:
-        doc = Document(
-            page_content='\n\n'.join([
-                hit.source.get("combined_article_description", ""),
-                get_context("content", hit, context_length=12)
-            ]),
-            metadata={
-                "title": hit.source.get("title", ""),
-                "source": "Candid Help",
-                "source_id": hit.source["id"],
-                "url": hit.source.get("link", "")
-            }
-        )
-    elif "news" in hit.index:
-        doc = Document(
-            page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
-            metadata={
-                "title": hit.source.get("title", ""),
-                "source": hit.source.get("site_name") or "Candid News",
-                "source_id": hit.source["id"],
-                "url": hit.source.get("link", "")
-            }
-        )
-    else:
-        raise ValueError(f"Unknown source result from index {hit.index}")
-    return doc