| <?php |
|
|
| declare(strict_types=1); |
|
|
| class Search |
| { |
| private Database $db; |
|
|
| public function __construct() |
| { |
| $this->db = Database::getInstance(); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public function fullText( |
| string $query, |
| ?string $collection = null, |
| int $limit = 50, |
| int $offset = 0, |
| ?string $topicFilter = null, |
| ?string $stampFilter = null |
| ): array { |
| if (trim($query) === '') { |
| return []; |
| } |
|
|
| $params = ['query' => $query]; |
| $joins = ''; |
| $extraWhere = ''; |
|
|
| if ($collection !== null && $collection !== '') { |
| $extraWhere .= ' AND d.source_section = :collection'; |
| $params['collection'] = $collection; |
| } |
|
|
| if ($topicFilter !== null && $topicFilter !== '') { |
| $joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id |
| AND df_topic.feature_name = 'topic_distribution' |
| AND (df_topic.feature_json->>:topicKey)::float > 0.5"; |
| $params['topicKey'] = $topicFilter; |
| } |
|
|
| if ($stampFilter !== null && $stampFilter !== '') { |
| $joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id |
| AND df_stamp.feature_name = 'forensic_metadata' |
| AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)"; |
| $params['stampVal'] = $stampFilter; |
| } |
|
|
| $params['limit'] = $limit; |
| $params['offset'] = $offset; |
|
|
| $sql = " |
| SELECT |
| p.id AS page_id, |
| p.document_id, |
| p.page_number, |
| p.ocr_confidence, |
| p.word_count, |
| d.file_path, |
| d.source_section, |
| d.total_pages, |
| ts_headline( |
| 'english', |
| p.ocr_text, |
| plainto_tsquery('english', :query), |
| 'StartSel=<mark>, StopSel=</mark>, MaxWords=60, MinWords=20, MaxFragments=3, FragmentDelimiter= ... ' |
| ) AS snippet, |
| ts_rank_cd(p.tsv, plainto_tsquery('english', :query)) AS rank |
| FROM pages p |
| INNER JOIN documents d ON d.id = p.document_id |
| {$joins} |
| WHERE p.tsv @@ plainto_tsquery('english', :query) |
| {$extraWhere} |
| ORDER BY rank DESC, p.document_id, p.page_number |
| LIMIT :limit OFFSET :offset |
| "; |
|
|
| return $this->db->fetchAll($sql, $params); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| public function semantic(int $pageId, int $limit = 20): array |
| { |
| $sql = " |
| SELECT |
| p2.id AS page_id, |
| p2.document_id, |
| p2.page_number, |
| p2.word_count, |
| p2.ocr_confidence, |
| d.file_path, |
| d.source_section, |
| d.total_pages, |
| (p1.embedding <=> p2.embedding) AS distance |
| FROM pages p1 |
| CROSS JOIN LATERAL ( |
| SELECT p.* |
| FROM pages p |
| WHERE p.id != p1.id |
| AND p.embedding IS NOT NULL |
| ORDER BY p1.embedding <=> p.embedding |
| LIMIT :limit |
| ) p2 |
| INNER JOIN documents d ON d.id = p2.document_id |
| WHERE p1.id = :pageId |
| AND p1.embedding IS NOT NULL |
| ORDER BY distance ASC |
| "; |
|
|
| return $this->db->fetchAll($sql, [ |
| 'pageId' => $pageId, |
| 'limit' => $limit, |
| ]); |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| public function countResults(string $query, ?string $collection = null, ?string $topicFilter = null, ?string $stampFilter = null): int |
| { |
| if (trim($query) === '') { |
| return 0; |
| } |
|
|
| $params = ['query' => $query]; |
| $joins = ''; |
| $extraWhere = ''; |
|
|
| if ($collection !== null && $collection !== '') { |
| $extraWhere .= ' AND d.source_section = :collection'; |
| $params['collection'] = $collection; |
| } |
|
|
| if ($topicFilter !== null && $topicFilter !== '') { |
| $joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id |
| AND df_topic.feature_name = 'topic_distribution' |
| AND (df_topic.feature_json->>:topicKey)::float > 0.5"; |
| $params['topicKey'] = $topicFilter; |
| } |
|
|
| if ($stampFilter !== null && $stampFilter !== '') { |
| $joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id |
| AND df_stamp.feature_name = 'forensic_metadata' |
| AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)"; |
| $params['stampVal'] = $stampFilter; |
| } |
|
|
| $sql = " |
| SELECT COUNT(*)::int AS cnt |
| FROM pages p |
| INNER JOIN documents d ON d.id = p.document_id |
| {$joins} |
| WHERE p.tsv @@ plainto_tsquery('english', :query) |
| {$extraWhere} |
| "; |
|
|
| $row = $this->db->fetchOne($sql, $params); |
|
|
| return $row ? (int) $row['cnt'] : 0; |
| } |
| } |
|
|