datamatters24's picture
Upload web/src/models/Search.php with huggingface_hub
fe0fc54 verified
<?php
declare(strict_types=1);
class Search
{
private Database $db;
public function __construct()
{
$this->db = Database::getInstance();
}
/**
* Full-text search across page OCR content.
*
* Uses PostgreSQL plainto_tsquery for safe, automatic query parsing.
* Returns matching pages with document metadata, a highlighted snippet,
* and a relevance rank.
*
* @param string $query Raw search terms
* @param string|null $collection Optional source_section filter
* @param int $limit Max results
* @param int $offset Pagination offset
* @return array<int, array<string, mixed>>
*/
public function fullText(
string $query,
?string $collection = null,
int $limit = 50,
int $offset = 0,
?string $topicFilter = null,
?string $stampFilter = null
): array {
if (trim($query) === '') {
return [];
}
$params = ['query' => $query];
$joins = '';
$extraWhere = '';
if ($collection !== null && $collection !== '') {
$extraWhere .= ' AND d.source_section = :collection';
$params['collection'] = $collection;
}
if ($topicFilter !== null && $topicFilter !== '') {
$joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id
AND df_topic.feature_name = 'topic_distribution'
AND (df_topic.feature_json->>:topicKey)::float > 0.5";
$params['topicKey'] = $topicFilter;
}
if ($stampFilter !== null && $stampFilter !== '') {
$joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id
AND df_stamp.feature_name = 'forensic_metadata'
AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)";
$params['stampVal'] = $stampFilter;
}
$params['limit'] = $limit;
$params['offset'] = $offset;
$sql = "
SELECT
p.id AS page_id,
p.document_id,
p.page_number,
p.ocr_confidence,
p.word_count,
d.file_path,
d.source_section,
d.total_pages,
ts_headline(
'english',
p.ocr_text,
plainto_tsquery('english', :query),
'StartSel=<mark>, StopSel=</mark>, MaxWords=60, MinWords=20, MaxFragments=3, FragmentDelimiter= ... '
) AS snippet,
ts_rank_cd(p.tsv, plainto_tsquery('english', :query)) AS rank
FROM pages p
INNER JOIN documents d ON d.id = p.document_id
{$joins}
WHERE p.tsv @@ plainto_tsquery('english', :query)
{$extraWhere}
ORDER BY rank DESC, p.document_id, p.page_number
LIMIT :limit OFFSET :offset
";
return $this->db->fetchAll($sql, $params);
}
/**
* Semantic (vector) similarity search.
*
* Uses pgvector cosine distance to find pages with embeddings most
* similar to the given page's embedding.
*
* @param int $pageId The source page whose embedding to compare against
* @param int $limit Max number of similar pages to return
* @return array<int, array<string, mixed>>
*/
public function semantic(int $pageId, int $limit = 20): array
{
$sql = "
SELECT
p2.id AS page_id,
p2.document_id,
p2.page_number,
p2.word_count,
p2.ocr_confidence,
d.file_path,
d.source_section,
d.total_pages,
(p1.embedding <=> p2.embedding) AS distance
FROM pages p1
CROSS JOIN LATERAL (
SELECT p.*
FROM pages p
WHERE p.id != p1.id
AND p.embedding IS NOT NULL
ORDER BY p1.embedding <=> p.embedding
LIMIT :limit
) p2
INNER JOIN documents d ON d.id = p2.document_id
WHERE p1.id = :pageId
AND p1.embedding IS NOT NULL
ORDER BY distance ASC
";
return $this->db->fetchAll($sql, [
'pageId' => $pageId,
'limit' => $limit,
]);
}
/**
* Count the total number of full-text search results (for pagination).
*
* @param string $query
* @param string|null $collection
* @return int
*/
public function countResults(string $query, ?string $collection = null, ?string $topicFilter = null, ?string $stampFilter = null): int
{
if (trim($query) === '') {
return 0;
}
$params = ['query' => $query];
$joins = '';
$extraWhere = '';
if ($collection !== null && $collection !== '') {
$extraWhere .= ' AND d.source_section = :collection';
$params['collection'] = $collection;
}
if ($topicFilter !== null && $topicFilter !== '') {
$joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id
AND df_topic.feature_name = 'topic_distribution'
AND (df_topic.feature_json->>:topicKey)::float > 0.5";
$params['topicKey'] = $topicFilter;
}
if ($stampFilter !== null && $stampFilter !== '') {
$joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id
AND df_stamp.feature_name = 'forensic_metadata'
AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)";
$params['stampVal'] = $stampFilter;
}
$sql = "
SELECT COUNT(*)::int AS cnt
FROM pages p
INNER JOIN documents d ON d.id = p.document_id
{$joins}
WHERE p.tsv @@ plainto_tsquery('english', :query)
{$extraWhere}
";
$row = $this->db->fetchOne($sql, $params);
return $row ? (int) $row['cnt'] : 0;
}
}