datamatters24 committed on
Commit
fe0fc54
·
verified ·
1 Parent(s): 9a33c82

Upload web/src/models/Search.php with huggingface_hub

Browse files
Files changed (1) hide show
  1. web/src/models/Search.php +187 -0
web/src/models/Search.php ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+
3
+ declare(strict_types=1);
4
+
5
class Search
{
    private Database $db;

    public function __construct()
    {
        $this->db = Database::getInstance();
    }

    /**
     * Full-text search across page OCR content.
     *
     * The raw terms are parsed with PostgreSQL's plainto_tsquery(), so user
     * input is never interpreted as tsquery operator syntax. The tsquery is
     * built once in the FROM clause and reused for matching, ranking and
     * snippet generation; this keeps the :query placeholder bound exactly
     * once, which does not rely on the driver accepting repeated named
     * parameters.
     *
     * NOTE: the "snippet" column wraps matches in <mark>...</mark> but the
     * surrounding text is the raw p.ocr_text. PostgreSQL does not guarantee
     * ts_headline output is safe for direct inclusion in a web page, so
     * callers presumably need to sanitise it before rendering — confirm at
     * the view layer.
     *
     * @param string      $query       Raw search terms; blank input yields []
     * @param string|null $collection  Optional documents.source_section filter
     * @param int         $limit       Max results; values <= 0 yield []
     * @param int         $offset      Pagination offset; negatives are treated as 0
     * @param string|null $topicFilter Optional key of the 'topic_distribution' feature
     *                                 whose value must exceed 0.5
     * @param string|null $stampFilter Optional stamp value that must appear in the
     *                                 'forensic_metadata' feature's "stamps" array
     * @return array<int, array<string, mixed>>
     */
    public function fullText(
        string $query,
        ?string $collection = null,
        int $limit = 50,
        int $offset = 0,
        ?string $topicFilter = null,
        ?string $stampFilter = null
    ): array {
        // Nothing to search for, or nothing requested: skip the round trip.
        // (PostgreSQL rejects a negative LIMIT, so this also avoids an error.)
        if (trim($query) === '' || $limit <= 0) {
            return [];
        }

        [$joins, $extraWhere, $params] = $this->buildFilters($collection, $topicFilter, $stampFilter);

        $params['query'] = $query;
        $params['limit'] = $limit;
        // PostgreSQL rejects a negative OFFSET; fall back to the first page.
        $params['offset'] = max(0, $offset);

        $sql = "
            SELECT
                p.id AS page_id,
                p.document_id,
                p.page_number,
                p.ocr_confidence,
                p.word_count,
                d.file_path,
                d.source_section,
                d.total_pages,
                ts_headline(
                    'english',
                    p.ocr_text,
                    tsq,
                    'StartSel=<mark>, StopSel=</mark>, MaxWords=60, MinWords=20, MaxFragments=3, FragmentDelimiter= ... '
                ) AS snippet,
                ts_rank_cd(p.tsv, tsq) AS rank
            FROM pages p
            INNER JOIN documents d ON d.id = p.document_id
            CROSS JOIN plainto_tsquery('english', :query) AS tsq
            {$joins}
            WHERE p.tsv @@ tsq
            {$extraWhere}
            ORDER BY rank DESC, p.document_id, p.page_number
            LIMIT :limit OFFSET :offset
        ";

        return $this->db->fetchAll($sql, $params);
    }

    /**
     * Semantic (vector) similarity search.
     *
     * Looks up the embedding of the given page and returns the other pages
     * whose embeddings are nearest to it according to the `<=>` distance
     * operator, closest first. Pages without an embedding are ignored, and
     * if the source page itself has no embedding the result is empty.
     *
     * @param int $pageId The source page whose embedding to compare against
     * @param int $limit  Max number of similar pages to return; values <= 0 yield []
     * @return array<int, array<string, mixed>>
     */
    public function semantic(int $pageId, int $limit = 20): array
    {
        // LIMIT 0 can never return rows and a negative LIMIT is a SQL error.
        if ($limit <= 0) {
            return [];
        }

        $sql = "
            SELECT
                p2.id AS page_id,
                p2.document_id,
                p2.page_number,
                p2.word_count,
                p2.ocr_confidence,
                d.file_path,
                d.source_section,
                d.total_pages,
                (p1.embedding <=> p2.embedding) AS distance
            FROM pages p1
            CROSS JOIN LATERAL (
                SELECT p.*
                FROM pages p
                WHERE p.id != p1.id
                  AND p.embedding IS NOT NULL
                ORDER BY p1.embedding <=> p.embedding
                LIMIT :limit
            ) p2
            INNER JOIN documents d ON d.id = p2.document_id
            WHERE p1.id = :pageId
              AND p1.embedding IS NOT NULL
            ORDER BY distance ASC
        ";

        return $this->db->fetchAll($sql, [
            'pageId' => $pageId,
            'limit'  => $limit,
        ]);
    }

    /**
     * Count the total number of full-text search results (for pagination).
     *
     * Applies the same match condition and the same optional filters as
     * fullText(), so the total is consistent with the pages it returns.
     *
     * @param string      $query       Raw search terms; blank input yields 0
     * @param string|null $collection  Optional documents.source_section filter
     * @param string|null $topicFilter Optional topic key (see fullText())
     * @param string|null $stampFilter Optional stamp value (see fullText())
     * @return int
     */
    public function countResults(string $query, ?string $collection = null, ?string $topicFilter = null, ?string $stampFilter = null): int
    {
        if (trim($query) === '') {
            return 0;
        }

        [$joins, $extraWhere, $params] = $this->buildFilters($collection, $topicFilter, $stampFilter);

        $params['query'] = $query;

        $sql = "
            SELECT COUNT(*)::int AS cnt
            FROM pages p
            INNER JOIN documents d ON d.id = p.document_id
            {$joins}
            WHERE p.tsv @@ plainto_tsquery('english', :query)
            {$extraWhere}
        ";

        $row = $this->db->fetchOne($sql, $params);

        return $row ? (int) $row['cnt'] : 0;
    }

    /**
     * Build the optional filter SQL shared by fullText() and countResults().
     *
     * Only constant SQL text is produced here; every caller-supplied value is
     * returned as a bound parameter. The fragments assume the surrounding
     * query aliases the documents table as `d`.
     *
     * NOTE(review): the feature filters are INNER JOINs, so a document with
     * more than one matching document_features row would be returned (and
     * counted) more than once — presumably (document_id, feature_name) is
     * unique; confirm against the schema.
     *
     * @return array{0: string, 1: string, 2: array<string, string>}
     *         [JOIN clauses, extra WHERE conditions, bound parameters]
     */
    private function buildFilters(?string $collection, ?string $topicFilter, ?string $stampFilter): array
    {
        $joins = '';
        $extraWhere = '';
        $params = [];

        if ($collection !== null && $collection !== '') {
            $extraWhere .= ' AND d.source_section = :collection';
            $params['collection'] = $collection;
        }

        if ($topicFilter !== null && $topicFilter !== '') {
            // Keep documents whose score for the requested topic key is above 0.5.
            $joins .= " INNER JOIN document_features df_topic ON df_topic.document_id = d.id
                AND df_topic.feature_name = 'topic_distribution'
                AND (df_topic.feature_json->>:topicKey)::float > 0.5";
            $params['topicKey'] = $topicFilter;
        }

        if ($stampFilter !== null && $stampFilter !== '') {
            // Keep documents whose "stamps" array has an element with this stamp value.
            $joins .= " INNER JOIN document_features df_stamp ON df_stamp.document_id = d.id
                AND df_stamp.feature_name = 'forensic_metadata'
                AND EXISTS (SELECT 1 FROM jsonb_array_elements(df_stamp.feature_json->'stamps') s WHERE s->>'stamp' = :stampVal)";
            $params['stampVal'] = $stampFilter;
        }

        return [$joins, $extraWhere, $params];
    }
}