Spaces:

VAILL
/

litigation-tracker

Sleeping

File size: 9,728 Bytes

dda5804

"""

Pinecone-backed vectorstore utilities for the AI Litigation Tracker.



Responsibilities:

- Embed case text using OpenAI embeddings.

- Chunk long documents and mean-pool embeddings into a single case vector.

- Upsert vectors into a Pinecone index.

- Run global similarity search for RAG (query_global).

- Look up a single case by normalized docket number or case name (get_case_by_filter).



Metadata stored with each vector includes:

- docket_number, case_name

- court_id, filing_date

- jurisdiction, courtlistener_url, latest_update

- n_docs (document count for the case)

"""

import os
import hashlib
import time
from typing import Dict, List, Optional

from dotenv import load_dotenv
from pinecone import Pinecone
from openai import OpenAI
import tiktoken

# Load environment variables (e.g., OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_INDEX)
load_dotenv()

OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
PINECONE_INDEX = os.getenv("PINECONE_INDEX", "ai-litigation-cases")

# Reuse single clients across calls
_oai = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
_pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

try:
    _index = _pc.Index(PINECONE_INDEX)
except Exception as e:
    # Fail early with a clear message if the index has not been created yet.
    raise RuntimeError(
        f"Pinecone index '{PINECONE_INDEX}' not found. "
        "Run data_updating_scripts.create_pinecone_index first."
    ) from e


# ============================================================
# Internal helpers
# ============================================================
def _norm(s: Optional[str]) -> Optional[str]:
    """

    Normalize strings for case-insensitive lookups.



    Returns lowercased, stripped text or None if input is not a string.

    """
    return s.lower().strip() if isinstance(s, str) else None


def _make_id(court_id: str, docket_number: str) -> str:
    """

    Create a stable, opaque ID for a (court_id, docket_number) pair.



    Uses SHA-1 for compactness and to avoid leaking the raw identifiers in the ID.

    """
    return hashlib.sha1(f"{court_id}|{docket_number}".encode()).hexdigest()


def _chunk_text(txt: str, max_tokens: int = 750) -> List[str]:
    """

    Split text into chunks based on token count using tiktoken.



    Args:

        txt:         Raw text to tokenize and chunk.

        max_tokens:  Maximum tokens per chunk (approximate prompt size control).



    Returns:

        A list of decoded text chunks. Returns [""] for empty input.

    """
    enc = tiktoken.get_encoding("cl100k_base")
    ids = enc.encode(txt or "")
    chunks = [enc.decode(ids[i : i + max_tokens]) for i in range(0, len(ids), max_tokens)]
    return chunks or [""]


def _embed_batch(texts: List[str]) -> List[List[float]]:
    """

    Embed a batch of texts using the configured OpenAI embedding model.



    Implements simple retry with backoff for transient errors.



    Args:

        texts: List of strings to embed.



    Returns:

        List of embedding vectors (one per input string), in order.

    """
    out: List[List[float]] = []
    i = 0
    while i < len(texts):
        batch = texts[i : i + 32]  # modest batch size for reliability
        for attempt in range(4):
            try:
                resp = _oai.embeddings.create(model=OPENAI_EMBED_MODEL, input=batch)
                out.extend([d.embedding for d in resp.data])
                break
            except Exception:
                if attempt == 3:
                    # After 4 attempts, re-raise to surface the failure.
                    raise
                # Simple linear backoff to avoid hammering the API.
                time.sleep(1.5 * (attempt + 1))
        i += 32
    return out


def _mean_pool(vectors: List[List[float]]) -> List[float]:
    """

    Compute the element-wise mean of a list of vectors.



    Used to aggregate chunk-level embeddings into a single case-level embedding.

    """
    if not vectors:
        return []
    d = len(vectors[0])
    acc = [0.0] * d
    for v in vectors:
        for j in range(d):
            acc[j] += v[j]
    return [x / len(vectors) for x in acc]


# ============================================================
# Public API: vector creation + indexing
# ============================================================
def case_to_vector_payload(

    *,

    docket_number: str,

    case_name: str,

    court_id: str,

    filing_date: Optional[str],

    concatenated_plain_text: str,

    extra_meta: Optional[Dict] = None,

):
    """

    Build a Pinecone-ready vector payload for a single case.



    Steps:

    1. Tokenize and chunk the concatenated case text.

    2. Embed each chunk with OpenAI embeddings.

    3. Mean-pool chunk embeddings into a single centroid vector.

    4. Construct a stable ID and attach metadata used for filtering and display.



    Args:

        docket_number:            Raw docket number string.

        case_name:                Case name/title.

        court_id:                 CourtListener slug (e.g., "mdd", "nysd").

        filing_date:              Filing date as a string (e.g., "08302023" or "2023-08-30").

        concatenated_plain_text:  Combined plain text for all documents in the case.

        extra_meta:               Optional dict for additional metadata such as:

                                - n_docs

                                - courtlistener_url

                                - jurisdiction

                                - latest_update



    Returns:

        (stable_id, embedding_vector, metadata_dict)

    """
    chunks = _chunk_text(concatenated_plain_text)
    embs = _embed_batch(chunks)
    centroid = _mean_pool(embs)
    stable_id = _make_id(court_id, docket_number)
    extra_meta = extra_meta or {}

    metadata = {
        "docket_number": docket_number,
        "docket_number_norm": _norm(docket_number),
        "case_name": case_name,
        "case_name_norm": _norm(case_name),
        "court_id": court_id,
        "filing_date": filing_date or "",
        "n_docs": extra_meta.get("n_docs"),
        "courtlistener_url": extra_meta.get("courtlistener_url"),
        "jurisdiction": extra_meta.get("jurisdiction"),
        "latest_update": extra_meta.get("latest_update"),
    }
    # Drop None values so the index metadata stays lean.
    metadata = {k: v for k, v in metadata.items() if v is not None}
    return stable_id, centroid, metadata


def already_indexed(*, court_id: str, docket_number: str) -> bool:
    """

    Check whether a case is already present in the Pinecone index.



    Args:

        court_id:       CourtListener slug.

        docket_number:  Docket number for the case.



    Returns:

        True if an entry with the stable ID exists in the index, else False.

    """
    vid = _make_id(court_id, docket_number)
    res = _index.fetch(ids=[vid])
    vectors = (res or {}).get("vectors") or {}
    return vid in vectors


def upsert_cases(vectors: List[Dict]) -> None:
    """

    Upsert a list of vector payloads into the Pinecone index.



    Args:

        vectors: List of dicts of the form {"id": ..., "values": [...], "metadata": {...}}.



    Notes:

        - Pinecone handles deduplication by ID, so upserts are idempotent.

        - We still batch requests for efficiency and API friendliness.

    """
    for i in range(0, len(vectors), 100):
        _index.upsert(vectors=vectors[i : i + 100])


# ============================================================
# Public API: querying
# ============================================================
def query_global(question: str, top_k: int = 5) -> List[Dict]:
    """

    Run a global semantic search over all cases for a natural language question.



    Args:

        question: Text query to embed and search with.

        top_k:    Maximum number of matches to return.



    Returns:

        A list of dictionaries for each match:

        {

            "score": <similarity score>,

            ...<all stored metadata fields>...

        }

    """
    q = _embed_batch([question])[0]
    res = _index.query(vector=q, top_k=top_k, include_metadata=True)
    return [{"score": m["score"], **m["metadata"]} for m in res.get("matches", [])]


def get_case_by_filter(

    *,

    docket_number: Optional[str] = None,

    case_name: Optional[str] = None,

) -> Optional[Dict]:
    """

    Look up a single case by normalized docket number and/or case name.



    This is primarily used by case-specific Q&A in rag/chains.py.



    Args:

        docket_number: Optional docket string (case-insensitive).

        case_name:     Optional case name string (case-insensitive).



    Returns:

        A metadata dict for the first matching case, or None if no match.

        The returned metadata mirrors what was stored in case_to_vector_payload.

    """
    flt: Dict[str, str] = {}
    if docket_number:
        flt["docket_number_norm"] = _norm(docket_number)
    if case_name:
        flt["case_name_norm"] = _norm(case_name)
    if not flt:
        return None

    # We do a "dummy" query using a zero vector and rely on metadata filter
    # to select the matching case. The dimension must match the embed model.
    dim = 1536 if "3-small" in OPENAI_EMBED_MODEL else 3072
    res = _index.query(
        vector=[0.0] * dim,
        top_k=1,
        include_metadata=True,
        filter=flt,
    )
    matches = res.get("matches", [])
    return matches[0]["metadata"] if matches else None