db = Database::getInstance(); } /** * Full-text search across page OCR content. * * Uses PostgreSQL plainto_tsquery for safe, automatic query parsing. * Returns matching pages with document metadata, a highlighted snippet, * and a relevance rank. * * @param string $query Raw search terms * @param string|null $collection Optional source_section filter * @param int $limit Max results * @param int $offset Pagination offset * @return array> */ public function fullText( string $query, ?string $collection = null, int $limit = 50, int $offset = 0, ?string $topicFilter = null, ?string $stampFilter = null ): array { if (trim($query) === '') { return []; } $params = ['query' => $query]; $joins = ''; $extraWhere = ''; if ($collection !== null && $collection !== '') { $extraWhere .= ' AND d.source_section = :collection'; $params['collection'] = $collection; } if ($topicFilter !== null && $topicFilter !== '') { $joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id AND df_topic.feature_name = 'topic_distribution' AND (df_topic.feature_json->>:topicKey)::float > 0.5"; $params['topicKey'] = $topicFilter; } if ($stampFilter !== null && $stampFilter !== '') { $joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id AND df_stamp.feature_name = 'forensic_metadata' AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)"; $params['stampVal'] = $stampFilter; } $params['limit'] = $limit; $params['offset'] = $offset; $sql = " SELECT p.id AS page_id, p.document_id, p.page_number, p.ocr_confidence, p.word_count, d.file_path, d.source_section, d.total_pages, ts_headline( 'english', p.ocr_text, plainto_tsquery('english', :query), 'StartSel=, StopSel=, MaxWords=60, MinWords=20, MaxFragments=3, FragmentDelimiter= ... ' ) AS snippet, ts_rank_cd(p.tsv, plainto_tsquery('english', :query)) AS rank FROM pages p INNER JOIN documents d ON d.id = p.document_id {$joins} WHERE p.tsv @@ plainto_tsquery('english', :query) {$extraWhere} ORDER BY rank DESC, p.document_id, p.page_number LIMIT :limit OFFSET :offset "; return $this->db->fetchAll($sql, $params); } /** * Semantic (vector) similarity search. * * Uses pgvector cosine distance to find pages with embeddings most * similar to the given page's embedding. * * @param int $pageId The source page whose embedding to compare against * @param int $limit Max number of similar pages to return * @return array> */ public function semantic(int $pageId, int $limit = 20): array { $sql = " SELECT p2.id AS page_id, p2.document_id, p2.page_number, p2.word_count, p2.ocr_confidence, d.file_path, d.source_section, d.total_pages, (p1.embedding <=> p2.embedding) AS distance FROM pages p1 CROSS JOIN LATERAL ( SELECT p.* FROM pages p WHERE p.id != p1.id AND p.embedding IS NOT NULL ORDER BY p1.embedding <=> p.embedding LIMIT :limit ) p2 INNER JOIN documents d ON d.id = p2.document_id WHERE p1.id = :pageId AND p1.embedding IS NOT NULL ORDER BY distance ASC "; return $this->db->fetchAll($sql, [ 'pageId' => $pageId, 'limit' => $limit, ]); } /** * Count the total number of full-text search results (for pagination). * * @param string $query * @param string|null $collection * @return int */ public function countResults(string $query, ?string $collection = null, ?string $topicFilter = null, ?string $stampFilter = null): int { if (trim($query) === '') { return 0; } $params = ['query' => $query]; $joins = ''; $extraWhere = ''; if ($collection !== null && $collection !== '') { $extraWhere .= ' AND d.source_section = :collection'; $params['collection'] = $collection; } if ($topicFilter !== null && $topicFilter !== '') { $joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id AND df_topic.feature_name = 'topic_distribution' AND (df_topic.feature_json->>:topicKey)::float > 0.5"; $params['topicKey'] = $topicFilter; } if ($stampFilter !== null && $stampFilter !== '') { $joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id AND df_stamp.feature_name = 'forensic_metadata' AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)"; $params['stampVal'] = $stampFilter; } $sql = " SELECT COUNT(*)::int AS cnt FROM pages p INNER JOIN documents d ON d.id = p.document_id {$joins} WHERE p.tsv @@ plainto_tsquery('english', :query) {$extraWhere} "; $row = $this->db->fetchOne($sql, $params); return $row ? (int) $row['cnt'] : 0; } }