File size: 6,424 Bytes
fe0fc54 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | <?php
declare(strict_types=1);
class Search
{
private Database $db;
public function __construct()
{
$this->db = Database::getInstance();
}
/**
* Full-text search across page OCR content.
*
* Uses PostgreSQL plainto_tsquery for safe, automatic query parsing.
* Returns matching pages with document metadata, a highlighted snippet,
* and a relevance rank.
*
* @param string $query Raw search terms
* @param string|null $collection Optional source_section filter
* @param int $limit Max results
* @param int $offset Pagination offset
* @return array<int, array<string, mixed>>
*/
public function fullText(
string $query,
?string $collection = null,
int $limit = 50,
int $offset = 0,
?string $topicFilter = null,
?string $stampFilter = null
): array {
if (trim($query) === '') {
return [];
}
$params = ['query' => $query];
$joins = '';
$extraWhere = '';
if ($collection !== null && $collection !== '') {
$extraWhere .= ' AND d.source_section = :collection';
$params['collection'] = $collection;
}
if ($topicFilter !== null && $topicFilter !== '') {
$joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id
AND df_topic.feature_name = 'topic_distribution'
AND (df_topic.feature_json->>:topicKey)::float > 0.5";
$params['topicKey'] = $topicFilter;
}
if ($stampFilter !== null && $stampFilter !== '') {
$joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id
AND df_stamp.feature_name = 'forensic_metadata'
AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)";
$params['stampVal'] = $stampFilter;
}
$params['limit'] = $limit;
$params['offset'] = $offset;
$sql = "
SELECT
p.id AS page_id,
p.document_id,
p.page_number,
p.ocr_confidence,
p.word_count,
d.file_path,
d.source_section,
d.total_pages,
ts_headline(
'english',
p.ocr_text,
plainto_tsquery('english', :query),
'StartSel=<mark>, StopSel=</mark>, MaxWords=60, MinWords=20, MaxFragments=3, FragmentDelimiter= ... '
) AS snippet,
ts_rank_cd(p.tsv, plainto_tsquery('english', :query)) AS rank
FROM pages p
INNER JOIN documents d ON d.id = p.document_id
{$joins}
WHERE p.tsv @@ plainto_tsquery('english', :query)
{$extraWhere}
ORDER BY rank DESC, p.document_id, p.page_number
LIMIT :limit OFFSET :offset
";
return $this->db->fetchAll($sql, $params);
}
/**
* Semantic (vector) similarity search.
*
* Uses pgvector cosine distance to find pages with embeddings most
* similar to the given page's embedding.
*
* @param int $pageId The source page whose embedding to compare against
* @param int $limit Max number of similar pages to return
* @return array<int, array<string, mixed>>
*/
public function semantic(int $pageId, int $limit = 20): array
{
$sql = "
SELECT
p2.id AS page_id,
p2.document_id,
p2.page_number,
p2.word_count,
p2.ocr_confidence,
d.file_path,
d.source_section,
d.total_pages,
(p1.embedding <=> p2.embedding) AS distance
FROM pages p1
CROSS JOIN LATERAL (
SELECT p.*
FROM pages p
WHERE p.id != p1.id
AND p.embedding IS NOT NULL
ORDER BY p1.embedding <=> p.embedding
LIMIT :limit
) p2
INNER JOIN documents d ON d.id = p2.document_id
WHERE p1.id = :pageId
AND p1.embedding IS NOT NULL
ORDER BY distance ASC
";
return $this->db->fetchAll($sql, [
'pageId' => $pageId,
'limit' => $limit,
]);
}
/**
* Count the total number of full-text search results (for pagination).
*
* @param string $query
* @param string|null $collection
* @return int
*/
public function countResults(string $query, ?string $collection = null, ?string $topicFilter = null, ?string $stampFilter = null): int
{
if (trim($query) === '') {
return 0;
}
$params = ['query' => $query];
$joins = '';
$extraWhere = '';
if ($collection !== null && $collection !== '') {
$extraWhere .= ' AND d.source_section = :collection';
$params['collection'] = $collection;
}
if ($topicFilter !== null && $topicFilter !== '') {
$joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id
AND df_topic.feature_name = 'topic_distribution'
AND (df_topic.feature_json->>:topicKey)::float > 0.5";
$params['topicKey'] = $topicFilter;
}
if ($stampFilter !== null && $stampFilter !== '') {
$joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id
AND df_stamp.feature_name = 'forensic_metadata'
AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)";
$params['stampVal'] = $stampFilter;
}
$sql = "
SELECT COUNT(*)::int AS cnt
FROM pages p
INNER JOIN documents d ON d.id = p.document_id
{$joins}
WHERE p.tsv @@ plainto_tsquery('english', :query)
{$extraWhere}
";
$row = $this->db->fetchOne($sql, $params);
return $row ? (int) $row['cnt'] : 0;
}
}
|