dasdebanna committed on
Commit
37a70cc
·
1 Parent(s): 59eb9c1

Prepare app for Hugging Face Space (include index files)

.env ADDED
File without changes
classified_tickets_phase2.json ADDED
@@ -0,0 +1,724 @@
1
+ [
2
+ {
3
+ "id": "TICKET-245",
4
+ "subject": "Connecting Snowflake to Atlan - required permissions?",
5
+ "body": "Hi team, we're trying to set up our primary Snowflake production database as a new source in Atlan, but the connection keeps failing. We've tried using our standard service account, but it's not working. Our entire BI team is blocked on this integration for a major upcoming project, so it's quite urgent. Could you please provide a definitive list of the exact permissions and credentials needed on the Snowflake side to get this working? Thanks.",
6
+ "classification": {
7
+ "id": "TICKET-245",
8
+ "topic_tags": [
9
+ "Product"
10
+ ],
11
+ "topic_scores": {
12
+ "Product": 0.2536420226097107,
13
+ "Connector": 0.21206863224506378,
14
+ "Sensitive data": 0.15129192173480988,
15
+ "SSO": 0.09911767393350601,
16
+ "How-to": 0.09207943081855774,
17
+ "Best practices": 0.06830797344446182,
18
+ "Glossary": 0.05438947677612305,
19
+ "API/SDK": 0.047063831239938736,
20
+ "Lineage": 0.022039052098989487
21
+ },
22
+ "sentiment": "Angry",
23
+ "priority": "P0"
24
+ }
25
+ },
26
+ {
27
+ "id": "TICKET-246",
28
+ "subject": "Which connectors automatically capture lineage?",
29
+ "body": "Hello, I'm new to Atlan and trying to understand the lineage capabilities. The documentation mentions automatic lineage, but it's not clear which of our connectors (we use Fivetran, dbt, and Tableau) support this out-of-the-box. We need to present a clear picture of our data flow to leadership next week. Can you explain how lineage capture differs for these tools?",
30
+ "classification": {
31
+ "id": "TICKET-246",
32
+ "topic_tags": [
33
+ "Lineage"
34
+ ],
35
+ "topic_scores": {
36
+ "Lineage": 0.7043173909187317,
37
+ "Product": 0.06416956335306168,
38
+ "Sensitive data": 0.05306507274508476,
39
+ "Connector": 0.05121781677007675,
40
+ "How-to": 0.03940143436193466,
41
+ "Best practices": 0.03072594851255417,
42
+ "SSO": 0.02197032794356346,
43
+ "Glossary": 0.018482282757759094,
44
+ "API/SDK": 0.01665017567574978
45
+ },
46
+ "sentiment": "Angry",
47
+ "priority": "P1"
48
+ }
49
+ },
50
+ {
51
+ "id": "TICKET-247",
52
+ "subject": "Deployment of Atlan agent for private data lake",
53
+ "body": "Our primary data lake is hosted on-premise within a secure VPC and is not exposed to the internet. We understand we need to use the Atlan agent for this, but the setup instructions are a bit confusing for our security team. This is a critical source for us, and we can't proceed with our rollout until we get this connected. Can you provide a detailed deployment guide or connect us with a technical expert?",
54
+ "classification": {
55
+ "id": "TICKET-247",
56
+ "topic_tags": [
57
+ "Sensitive data"
58
+ ],
59
+ "topic_scores": {
60
+ "Sensitive data": 0.4692264795303345,
61
+ "How-to": 0.13292741775512695,
62
+ "Product": 0.1230754628777504,
63
+ "SSO": 0.06440062075853348,
64
+ "Connector": 0.058400604873895645,
65
+ "Best practices": 0.05110899731516838,
66
+ "Glossary": 0.04289798066020012,
67
+ "API/SDK": 0.036751993000507355,
68
+ "Lineage": 0.021210432052612305
69
+ },
70
+ "sentiment": "Angry",
71
+ "priority": "P0"
72
+ }
73
+ },
74
+ {
75
+ "id": "TICKET-248",
76
+ "subject": "How to surface sample rows and schema changes?",
77
+ "body": "Hi, we've successfully connected our Redshift cluster, and the assets are showing up. However, my data analysts are asking how they can see sample data or recent schema changes directly within Atlan without having to go back to Redshift. Is this feature available? I feel like I'm missing something obvious.",
78
+ "classification": {
79
+ "id": "TICKET-248",
80
+ "topic_tags": [
81
+ "How-to"
82
+ ],
83
+ "topic_scores": {
84
+ "How-to": 0.5293518900871277,
85
+ "Sensitive data": 0.12791359424591064,
86
+ "Connector": 0.11278416216373444,
87
+ "Product": 0.07702862471342087,
88
+ "Best practices": 0.035295821726322174,
89
+ "SSO": 0.03469147905707359,
90
+ "API/SDK": 0.030026594176888466,
91
+ "Lineage": 0.026599474251270294,
92
+ "Glossary": 0.0263083353638649
93
+ },
94
+ "sentiment": "Angry",
95
+ "priority": "P2"
96
+ }
97
+ },
98
+ {
99
+ "id": "TICKET-249",
100
+ "subject": "Exporting lineage view for a specific table",
101
+ "body": "For our quarterly audit, I need to provide a complete upstream and downstream lineage diagram for our core `fact_orders` table. I can see the lineage perfectly in the UI, but I can't find an option to export this view as an image or PDF. This is a hard requirement from our compliance team and the deadline is approaching fast. Please help!",
102
+ "classification": {
103
+ "id": "TICKET-249",
104
+ "topic_tags": [
105
+ "Lineage"
106
+ ],
107
+ "topic_scores": {
108
+ "Lineage": 0.6261916756629944,
109
+ "Sensitive data": 0.10166331380605698,
110
+ "Product": 0.08675874769687653,
111
+ "How-to": 0.05723349004983902,
112
+ "Best practices": 0.038304269313812256,
113
+ "Connector": 0.025699065998196602,
114
+ "SSO": 0.02333613485097885,
115
+ "Glossary": 0.022767778486013412,
116
+ "API/SDK": 0.018045460805296898
117
+ },
118
+ "sentiment": "Angry",
119
+ "priority": "P0"
120
+ }
121
+ },
122
+ {
123
+ "id": "TICKET-250",
124
+ "subject": "Importing lineage from Airflow jobs",
125
+ "body": "We run hundreds of ETL jobs in Airflow, and we need to see that lineage reflected in Atlan. I've read that Atlan can integrate with Airflow, but how do we configure it to correctly map our DAGs to the specific datasets they are transforming? The current documentation is a bit high-level.",
126
+ "classification": {
127
+ "id": "TICKET-250",
128
+ "topic_tags": [
129
+ "Lineage"
130
+ ],
131
+ "topic_scores": {
132
+ "Lineage": 0.6405794024467468,
133
+ "How-to": 0.07339552789926529,
134
+ "Sensitive data": 0.06700912863016129,
135
+ "Product": 0.04682222753763199,
136
+ "Best practices": 0.03841668739914894,
137
+ "SSO": 0.03800088167190552,
138
+ "API/SDK": 0.03554345667362213,
139
+ "Glossary": 0.03051498532295227,
140
+ "Connector": 0.029717685654759407
141
+ },
142
+ "sentiment": "Positive",
143
+ "priority": "P1"
144
+ }
145
+ },
146
+ {
147
+ "id": "TICKET-251",
148
+ "subject": "Using the Visual Query Builder",
149
+ "body": "I'm a business analyst and not very comfortable with writing complex SQL. I was excited to see the Visual Query Builder in Atlan, but I'm having trouble figuring out how to join multiple tables and save my query for later use. Is there a tutorial or a quick guide you can point me to?",
150
+ "classification": {
151
+ "id": "TICKET-251",
152
+ "topic_tags": [
153
+ "How-to"
154
+ ],
155
+ "topic_scores": {
156
+ "How-to": 0.20408794283866882,
157
+ "Sensitive data": 0.19889409840106964,
158
+ "Best practices": 0.1357269585132599,
159
+ "Product": 0.1232258677482605,
160
+ "Glossary": 0.08558867126703262,
161
+ "Connector": 0.07553622126579285,
162
+ "API/SDK": 0.06800619512796402,
163
+ "SSO": 0.06174485385417938,
164
+ "Lineage": 0.047189224511384964
165
+ },
166
+ "sentiment": "Angry",
167
+ "priority": "P2"
168
+ }
169
+ },
170
+ {
171
+ "id": "TICKET-252",
172
+ "subject": "Programmatic extraction of lineage",
173
+ "body": "Our internal data science team wants to build a custom application that analyzes metadata propagation delays. To do this, we need to programmatically extract lineage data from Atlan via an API. Does the API expose lineage information, and if so, could you provide an example of the endpoint and the structure of the response?",
174
+ "classification": {
175
+ "id": "TICKET-252",
176
+ "topic_tags": [
177
+ "Lineage"
178
+ ],
179
+ "topic_scores": {
180
+ "Lineage": 0.40903380513191223,
181
+ "How-to": 0.12547598779201508,
182
+ "Sensitive data": 0.12143483757972717,
183
+ "Product": 0.12053452432155609,
184
+ "Best practices": 0.0685185045003891,
185
+ "API/SDK": 0.06657088547945023,
186
+ "Glossary": 0.03478899598121643,
187
+ "SSO": 0.029303470626473427,
188
+ "Connector": 0.024339014664292336
189
+ },
190
+ "sentiment": "Angry",
191
+ "priority": "P1"
192
+ }
193
+ },
194
+ {
195
+ "id": "TICKET-253",
196
+ "subject": "Upstream lineage to Snowflake view not working",
197
+ "body": "This is infuriating. We have a critical Snowflake view, `finance.daily_revenue`, that is built from three upstream tables. Atlan is correctly showing the downstream dependencies, but the upstream lineage is completely missing. This makes the view untrustworthy for our analysts. We've re-run the crawler multiple times. What could be causing this? This is a huge problem for us.",
198
+ "classification": {
199
+ "id": "TICKET-253",
200
+ "topic_tags": [
201
+ "Lineage"
202
+ ],
203
+ "topic_scores": {
204
+ "Lineage": 0.6192259192466736,
205
+ "Sensitive data": 0.14630955457687378,
206
+ "Product": 0.08304131031036377,
207
+ "SSO": 0.03572985529899597,
208
+ "Glossary": 0.03130059689283371,
209
+ "How-to": 0.02567395195364952,
210
+ "Best practices": 0.025151418522000313,
211
+ "Connector": 0.018504904583096504,
212
+ "API/SDK": 0.015062461607158184
213
+ },
214
+ "sentiment": "Angry",
215
+ "priority": "P0"
216
+ }
217
+ },
218
+ {
219
+ "id": "TICKET-254",
220
+ "subject": "How to create a business glossary and link terms in bulk?",
221
+ "body": "We are migrating our existing business glossary from a spreadsheet into Atlan. We have over 500 terms. Manually creating each one and linking them to thousands of assets seems impossible. Is there a bulk import feature using CSV or an API to create terms and link them to assets? This is blocking our entire governance initiative.",
222
+ "classification": {
223
+ "id": "TICKET-254",
224
+ "topic_tags": [
225
+ "How-to",
226
+ "Glossary"
227
+ ],
228
+ "topic_scores": {
229
+ "How-to": 0.4522898495197296,
230
+ "Glossary": 0.4390662610530853,
231
+ "Sensitive data": 0.021999819204211235,
232
+ "Product": 0.01799585483968258,
233
+ "SSO": 0.017875080928206444,
234
+ "Best practices": 0.016796141862869263,
235
+ "API/SDK": 0.01395806111395359,
236
+ "Lineage": 0.010493746027350426,
237
+ "Connector": 0.009525217115879059
238
+ },
239
+ "sentiment": "Angry",
240
+ "priority": "P2"
241
+ }
242
+ },
243
+ {
244
+ "id": "TICKET-255",
245
+ "subject": "Creating a custom role for data stewards",
246
+ "body": "I'm trying to set up a custom role for our data stewards. They need permission to edit descriptions and link glossary terms, but they should NOT have permission to run queries or change connection settings. I'm looking at the default roles, but none of them fit perfectly. How can I create a new role with this specific set of permissions?",
247
+ "classification": {
248
+ "id": "TICKET-255",
249
+ "topic_tags": [
250
+ "How-to"
251
+ ],
252
+ "topic_scores": {
253
+ "How-to": 0.25342780351638794,
254
+ "Glossary": 0.2120532989501953,
255
+ "Product": 0.1341620683670044,
256
+ "Sensitive data": 0.08589652925729752,
257
+ "Connector": 0.08091939240694046,
258
+ "Best practices": 0.07565046101808548,
259
+ "API/SDK": 0.05833519995212555,
260
+ "SSO": 0.055307161062955856,
261
+ "Lineage": 0.044248051941394806
262
+ },
263
+ "sentiment": "Angry",
264
+ "priority": "P1"
265
+ }
266
+ },
267
+ {
268
+ "id": "TICKET-256",
269
+ "subject": "Mapping Active Directory groups to Atlan teams",
270
+ "body": "Our company policy requires us to manage all user access through Active Directory groups. We need to map our existing AD groups (e.g., 'data-analyst-finance', 'data-engineer-core') to teams within Atlan to automatically grant the correct permissions. I can't find the settings for this. How is this configured?",
271
+ "classification": {
272
+ "id": "TICKET-256",
273
+ "topic_tags": [
274
+ "How-to"
275
+ ],
276
+ "topic_scores": {
277
+ "How-to": 0.24342772364616394,
278
+ "Sensitive data": 0.19628578424453735,
279
+ "Best practices": 0.12237202376127243,
280
+ "Product": 0.11746875196695328,
281
+ "Glossary": 0.08366748690605164,
282
+ "SSO": 0.077210932970047,
283
+ "Connector": 0.0679486095905304,
284
+ "API/SDK": 0.04821234196424484,
285
+ "Lineage": 0.04340638965368271
286
+ },
287
+ "sentiment": "Angry",
288
+ "priority": "P0"
289
+ }
290
+ },
291
+ {
292
+ "id": "TICKET-257",
293
+ "subject": "RBAC for assets vs. glossaries",
294
+ "body": "I need clarification on how Atlan's role-based access control works. If a user is denied access to a specific Snowflake schema, can they still see the glossary terms that are linked to the tables in that schema? I need to ensure our PII governance is airtight.",
295
+ "classification": {
296
+ "id": "TICKET-257",
297
+ "topic_tags": [
298
+ "Sensitive data",
299
+ "Glossary"
300
+ ],
301
+ "topic_scores": {
302
+ "Sensitive data": 0.5029798746109009,
303
+ "Glossary": 0.325946182012558,
304
+ "How-to": 0.03934895619750023,
305
+ "Best practices": 0.03307223320007324,
306
+ "Product": 0.0300066489726305,
307
+ "Lineage": 0.020622894167900085,
308
+ "SSO": 0.019720233976840973,
309
+ "Connector": 0.018421567976474762,
310
+ "API/SDK": 0.009881414473056793
311
+ },
312
+ "sentiment": "Angry",
313
+ "priority": "P1"
314
+ }
315
+ },
316
+ {
317
+ "id": "TICKET-258",
318
+ "subject": "Process for onboarding asset owners",
319
+ "body": "We've started identifying owners for our key data assets. What is the recommended workflow in Atlan to assign these owners and automatically notify them? We want to make sure they are aware of their responsibilities without us having to send manual emails for every assignment.",
320
+ "classification": {
321
+ "id": "TICKET-258",
322
+ "topic_tags": [
323
+ "How-to"
324
+ ],
325
+ "topic_scores": {
326
+ "How-to": 0.2538345754146576,
327
+ "Sensitive data": 0.242264062166214,
328
+ "Best practices": 0.15336112678050995,
329
+ "Product": 0.11627519130706787,
330
+ "SSO": 0.0775536373257637,
331
+ "Glossary": 0.055225685238838196,
332
+ "Connector": 0.043340008705854416,
333
+ "API/SDK": 0.03405303508043289,
334
+ "Lineage": 0.02409267984330654
335
+ },
336
+ "sentiment": "Positive",
337
+ "priority": "P2"
338
+ }
339
+ },
340
+ {
341
+ "id": "TICKET-259",
342
+ "subject": "How does Atlan surface sensitive fields like PII?",
343
+ "body": "Our security team is evaluating Atlan and their main question is around PII and sensitive data. How does Atlan automatically identify fields containing PII? What are our options to apply tags or masks to these fields once they are identified to prevent unauthorized access?",
344
+ "classification": {
345
+ "id": "TICKET-259",
346
+ "topic_tags": [
347
+ "Sensitive data"
348
+ ],
349
+ "topic_scores": {
350
+ "Sensitive data": 0.8192521333694458,
351
+ "How-to": 0.09408269077539444,
352
+ "Product": 0.022458547726273537,
353
+ "SSO": 0.01460132747888565,
354
+ "Best practices": 0.013057924807071686,
355
+ "Connector": 0.011657687835395336,
356
+ "Glossary": 0.008879464119672775,
357
+ "Lineage": 0.008671694435179234,
358
+ "API/SDK": 0.0073384749703109264
359
+ },
360
+ "sentiment": "Angry",
361
+ "priority": "P2"
362
+ }
363
+ },
364
+ {
365
+ "id": "TICKET-260",
366
+ "subject": "Authentication methods for APIs and SDKs",
367
+ "body": "We are planning to build several automations using the Atlan API and Python SDK. What authentication methods are supported? Is it just API keys, or can we use something like OAuth? We have a strict policy that requires key rotation every 90 days, so we need to understand how to manage this programmatically.",
368
+ "classification": {
369
+ "id": "TICKET-260",
370
+ "topic_tags": [
371
+ "API/SDK"
372
+ ],
373
+ "topic_scores": {
374
+ "API/SDK": 0.876763105392456,
375
+ "Product": 0.029590053483843803,
376
+ "How-to": 0.02613384835422039,
377
+ "Sensitive data": 0.022094866260886192,
378
+ "Best practices": 0.01735789142549038,
379
+ "Glossary": 0.012602042406797409,
380
+ "Connector": 0.005761501379311085,
381
+ "SSO": 0.005750404670834541,
382
+ "Lineage": 0.003946291748434305
383
+ },
384
+ "sentiment": "Angry",
385
+ "priority": "P1"
386
+ }
387
+ },
388
+ {
389
+ "id": "TICKET-261",
390
+ "subject": "Enabling and testing SAML SSO",
391
+ "body": "We are ready to enable SAML SSO with our Okta instance. However, we are very concerned about disrupting our active users if the configuration is wrong. Is there a way to test the SSO configuration for a specific user or group before we enable it for the entire workspace?",
392
+ "classification": {
393
+ "id": "TICKET-261",
394
+ "topic_tags": [
395
+ "SSO"
396
+ ],
397
+ "topic_scores": {
398
+ "SSO": 0.8798295855522156,
399
+ "Sensitive data": 0.024509701877832413,
400
+ "Best practices": 0.023383136838674545,
401
+ "Product": 0.02332100085914135,
402
+ "How-to": 0.015699787065386772,
403
+ "Glossary": 0.010842178016901016,
404
+ "Connector": 0.008763168007135391,
405
+ "Lineage": 0.006830730475485325,
406
+ "API/SDK": 0.006820705719292164
407
+ },
408
+ "sentiment": "Angry",
409
+ "priority": "P2"
410
+ }
411
+ },
412
+ {
413
+ "id": "TICKET-262",
414
+ "subject": "SSO login not assigning user to correct group",
415
+ "body": "I've just had a new user, 'test.user@company.com', log in via our newly configured SSO. They were authenticated successfully, but they were not added to the 'Data Analysts' group as expected based on our SAML assertions. This is preventing them from accessing any assets. What could be the reason for this mis-assignment?",
416
+ "classification": {
417
+ "id": "TICKET-262",
418
+ "topic_tags": [
419
+ "SSO"
420
+ ],
421
+ "topic_scores": {
422
+ "SSO": 0.857302725315094,
423
+ "Sensitive data": 0.049394093453884125,
424
+ "Product": 0.021338628605008125,
425
+ "Best practices": 0.01603107526898384,
426
+ "Connector": 0.015189331956207752,
427
+ "Lineage": 0.011370034888386726,
428
+ "How-to": 0.010868269950151443,
429
+ "Glossary": 0.01049866247922182,
430
+ "API/SDK": 0.008007260970771313
431
+ },
432
+ "sentiment": "Angry",
433
+ "priority": "P2"
434
+ }
435
+ },
436
+ {
437
+ "id": "TICKET-263",
438
+ "subject": "Integration with existing DLP or secrets manager",
439
+ "body": "Does Atlan have the capability to integrate with third-party tools like a DLP (Data Loss Prevention) solution or a secrets manager like HashiCorp Vault? We need to ensure that connection credentials and sensitive metadata classifications are handled by our central security systems.",
440
+ "classification": {
441
+ "id": "TICKET-263",
442
+ "topic_tags": [
443
+ "Sensitive data"
444
+ ],
445
+ "topic_scores": {
446
+ "Sensitive data": 0.42641857266426086,
447
+ "Product": 0.1320459097623825,
448
+ "Connector": 0.09498800337314606,
449
+ "Best practices": 0.08815968036651611,
450
+ "How-to": 0.08476880192756653,
451
+ "SSO": 0.055265337228775024,
452
+ "Lineage": 0.05023786425590515,
453
+ "Glossary": 0.03815902769565582,
454
+ "API/SDK": 0.029956845566630363
455
+ },
456
+ "sentiment": "Angry",
457
+ "priority": "P1"
458
+ }
459
+ },
460
+ {
461
+ "id": "TICKET-264",
462
+ "subject": "Accessing audit logs for compliance reviews",
463
+ "body": "Our compliance team needs to perform a quarterly review of all activities within Atlan. They need to know who accessed what data, who made permission changes, etc. Where can we find these audit logs, and is there a way to export them or pull them via an API for our records?",
464
+ "classification": {
465
+ "id": "TICKET-264",
466
+ "topic_tags": [
467
+ "How-to"
468
+ ],
469
+ "topic_scores": {
470
+ "How-to": 0.3180291950702667,
471
+ "Sensitive data": 0.15853844583034515,
472
+ "Best practices": 0.10388915240764618,
473
+ "Glossary": 0.1034572422504425,
474
+ "Product": 0.08544151484966278,
475
+ "Connector": 0.07629270106554031,
476
+ "API/SDK": 0.07074188441038132,
477
+ "SSO": 0.04728706181049347,
478
+ "Lineage": 0.03632282093167305
479
+ },
480
+ "sentiment": "Angry",
481
+ "priority": "P1"
482
+ }
483
+ },
484
+ {
485
+ "id": "TICKET-265",
486
+ "subject": "How to programmatically create an asset using the REST API?",
487
+ "body": "I'm trying to create a new custom asset (a 'Report') using the REST API, but my requests keep failing with a 400 error. The API documentation is a bit sparse on the required payload structure for creating new entities. Could you provide a basic cURL or Python `requests` example of what a successful request body should look like?",
488
+ "classification": {
489
+ "id": "TICKET-265",
490
+ "topic_tags": [
491
+ "How-to"
492
+ ],
493
+ "topic_scores": {
494
+ "How-to": 0.7019529938697815,
495
+ "Product": 0.07501153647899628,
496
+ "Sensitive data": 0.05422753840684891,
497
+ "API/SDK": 0.04047072306275368,
498
+ "Best practices": 0.03833013400435448,
499
+ "Glossary": 0.029218755662441254,
500
+ "Connector": 0.022240931168198586,
501
+ "SSO": 0.02166176587343216,
502
+ "Lineage": 0.01688558980822563
503
+ },
504
+ "sentiment": "Angry",
505
+ "priority": "P1"
506
+ }
507
+ },
508
+ {
509
+ "id": "TICKET-266",
510
+ "subject": "SDK availability and Python example",
511
+ "body": "I'm a data engineer and prefer using SDKs over raw API calls. Which languages do you provide SDKs for? I'm particularly interested in Python. Where can I find the installation instructions (e.g., PyPI package name) and a short code snippet for a common task, like creating a new glossary term?",
512
+ "classification": {
513
+ "id": "TICKET-266",
514
+ "topic_tags": [
515
+ "API/SDK"
516
+ ],
517
+ "topic_scores": {
518
+ "API/SDK": 0.5466265678405762,
519
+ "Glossary": 0.11553136259317398,
520
+ "How-to": 0.08963733166456223,
521
+ "Product": 0.06004762277007103,
522
+ "Sensitive data": 0.05992572009563446,
523
+ "Best practices": 0.056482475250959396,
524
+ "SSO": 0.032319601625204086,
525
+ "Connector": 0.020940439775586128,
526
+ "Lineage": 0.018488900735974312
527
+ },
528
+ "sentiment": "Angry",
529
+ "priority": "P2"
530
+ }
531
+ },
532
+ {
533
+ "id": "TICKET-267",
534
+ "subject": "How do webhooks work in Atlan?",
535
+ "body": "I'm exploring using webhooks to send real-time notifications from Atlan to our internal Slack channel. I need to understand what types of events (e.g., asset updated, term created) can trigger a webhook. Also, how do we validate that the incoming payloads are genuinely from Atlan? Do you support payload signing?",
536
+ "classification": {
537
+ "id": "TICKET-267",
538
+ "topic_tags": [
539
+ "How-to"
540
+ ],
541
+ "topic_scores": {
542
+ "How-to": 0.6155452728271484,
543
+ "Product": 0.0822950229048729,
544
+ "Sensitive data": 0.07453867048025131,
545
+ "Connector": 0.04730972647666931,
546
+ "SSO": 0.04478367045521736,
547
+ "Best practices": 0.04298403859138489,
548
+ "Glossary": 0.04120447859168053,
549
+ "API/SDK": 0.03110373578965664,
550
+ "Lineage": 0.0202353373169899
551
+ },
552
+ "sentiment": "Angry",
553
+ "priority": "P1"
554
+ }
555
+ },
556
+ {
557
+ "id": "TICKET-268",
558
+ "subject": "Triggering an AWS Lambda from Atlan events",
559
+ "body": "We have a workflow where we want to trigger a custom AWS Lambda function whenever a specific Atlan tag (e.g., 'PII-Confirmed') is added to an asset. What is the recommended and most secure way to set this up? Should we use webhooks pointing to an API Gateway, or is there a more direct integration?",
560
+ "classification": {
561
+ "id": "TICKET-268",
562
+ "topic_tags": [
563
+ "Sensitive data"
564
+ ],
565
+ "topic_scores": {
566
+ "Sensitive data": 0.6416335701942444,
567
+ "How-to": 0.07063692808151245,
568
+ "Product": 0.06787507981061935,
569
+ "Best practices": 0.05057653412222862,
570
+ "Connector": 0.04566250368952751,
571
+ "SSO": 0.04065138101577759,
572
+ "Glossary": 0.034141793847084045,
573
+ "API/SDK": 0.0318898931145668,
574
+ "Lineage": 0.016932277008891106
575
+ },
576
+ "sentiment": "Angry",
577
+ "priority": "P2"
578
+ }
579
+ },
580
+ {
581
+ "id": "TICKET-269",
582
+ "subject": "When to use Atlan automations vs. external services?",
583
+ "body": "I see that Atlan has a built-in 'Automations' feature. I'm trying to decide if I should use this to manage a workflow or if I should use an external service like Zapier or our own Airflow instance. Could you provide some guidance or examples on what types of workflows are best suited for the native automations versus an external tool?",
584
+ "classification": {
585
+ "id": "TICKET-269",
586
+ "topic_tags": [
587
+ "How-to"
588
+ ],
589
+ "topic_scores": {
590
+ "How-to": 0.31247150897979736,
591
+ "Product": 0.1856827437877655,
592
+ "Best practices": 0.12680679559707642,
593
+ "Sensitive data": 0.11848830431699753,
594
+ "Glossary": 0.0666823759675026,
595
+ "SSO": 0.059052322059869766,
596
+ "API/SDK": 0.05207480862736702,
597
+ "Connector": 0.0423959419131279,
598
+ "Lineage": 0.03634531795978546
599
+ },
600
+ "sentiment": "Angry",
601
+ "priority": "P2"
602
+ }
603
+ },
604
+ {
605
+ "id": "TICKET-270",
606
+ "subject": "Connector failed to crawl - where to check logs?",
607
+ "body": "URGENT: Our nightly Snowflake crawler failed last night and no new metadata was ingested. This is a critical failure as our morning reports are now missing lineage information. Where can I find the detailed error logs for the crawler run to understand what went wrong? I need to fix this ASAP.",
608
+ "classification": {
609
+ "id": "TICKET-270",
610
+ "topic_tags": [
611
+ "Connector"
612
+ ],
613
+ "topic_scores": {
614
+ "Connector": 0.7956897616386414,
615
+ "Lineage": 0.10316770523786545,
616
+ "Sensitive data": 0.036917030811309814,
617
+ "Product": 0.021942023187875748,
618
+ "SSO": 0.012663195841014385,
619
+ "How-to": 0.009871531277894974,
620
+ "Glossary": 0.008735943585634232,
621
+ "Best practices": 0.005630514584481716,
622
+ "API/SDK": 0.005382323171943426
623
+ },
624
+ "sentiment": "Angry",
625
+ "priority": "P0"
626
+ }
627
+ },
628
+ {
629
+ "id": "TICKET-271",
630
+ "subject": "Asset extracted but not published to Atlan",
631
+ "body": "This is very strange. I'm looking at the crawler logs, and I can see that the asset 'schema.my_table' was successfully extracted from the source. However, when I search for this table in the Atlan UI, it doesn't appear. It seems like it's getting stuck somewhere between extraction and publishing. Can you please investigate the root cause?",
632
+ "classification": {
633
+ "id": "TICKET-271",
634
+ "topic_tags": [
635
+ "Product"
636
+ ],
637
+ "topic_scores": {
638
+ "Product": 0.3031204640865326,
639
+ "Sensitive data": 0.1932554543018341,
640
+ "Best practices": 0.1285562515258789,
641
+ "SSO": 0.08312994241714478,
642
+ "Connector": 0.07090307772159576,
643
+ "How-to": 0.0644526332616806,
644
+ "Lineage": 0.06026022136211395,
645
+ "API/SDK": 0.05608906224370003,
646
+ "Glossary": 0.04023294523358345
647
+ },
648
+ "sentiment": "Angry",
649
+ "priority": "P2"
650
+ }
651
+ },
652
+ {
653
+ "id": "TICKET-272",
654
+ "subject": "How to measure adoption and generate reports?",
655
+ "body": "My manager is asking for metrics on our Atlan usage to justify the investment. I need to generate a report showing things like the number of active users, most frequently queried tables, and the number of assets with assigned owners. Does Atlan have a reporting or dashboarding feature for this?",
656
+ "classification": {
657
+ "id": "TICKET-272",
658
+ "topic_tags": [
659
+ "How-to"
660
+ ],
661
+ "topic_scores": {
662
+ "How-to": 0.7723440527915955,
663
+ "Product": 0.057156000286340714,
664
+ "Sensitive data": 0.04286324605345726,
665
+ "Best practices": 0.03300917148590088,
666
+ "Connector": 0.02577182836830616,
667
+ "Glossary": 0.020914847031235695,
668
+ "SSO": 0.019454676657915115,
669
+ "API/SDK": 0.014868470840156078,
670
+ "Lineage": 0.013617790304124355
671
+ },
672
+ "sentiment": "Angry",
673
+ "priority": "P1"
674
+ }
675
+ },
676
+ {
677
+ "id": "TICKET-273",
678
+ "subject": "Best practices for catalog hygiene",
679
+ "body": "We've been using Atlan for six months, and our catalog is already starting to get a bit messy with duplicate assets and stale metadata from old tests. As we roll this out to more teams, what are some common best practices or features within Atlan that can help us maintain good catalog hygiene and prevent this problem from getting worse?",
680
+ "classification": {
681
+ "id": "TICKET-273",
682
+ "topic_tags": [
683
+ "Best practices"
684
+ ],
685
+ "topic_scores": {
686
+ "Best practices": 0.8946393728256226,
687
+ "Product": 0.03143538162112236,
688
+ "How-to": 0.0247127003967762,
689
+ "Sensitive data": 0.014560189098119736,
690
+ "Glossary": 0.009846128523349762,
691
+ "Connector": 0.007913430221378803,
692
+ "Lineage": 0.007438138592988253,
693
+ "SSO": 0.006103003863245249,
694
+ "API/SDK": 0.0033516811672598124
695
+ },
696
+ "sentiment": "Angry",
697
+ "priority": "P2"
698
+ }
699
+ },
700
+ {
701
+ "id": "TICKET-274",
702
+ "subject": "How to scale Atlan across multiple business units?",
703
+ "body": "We are planning a global rollout of Atlan to multiple business units, each with its own data sources and governance teams. We're looking for advice on the best way to structure our Atlan instance. Should we use separate workspaces, or can we achieve isolation using teams and permissions within a single workspace while maintaining a consistent governance model?",
704
+ "classification": {
705
+ "id": "TICKET-274",
706
+ "topic_tags": [
707
+ "How-to"
708
+ ],
709
+ "topic_scores": {
710
+ "How-to": 0.5874260067939758,
711
+ "Best practices": 0.10692328214645386,
712
+ "Product": 0.09862891584634781,
713
+ "Sensitive data": 0.0611591637134552,
714
+ "Glossary": 0.038298100233078,
715
+ "SSO": 0.037935610860586166,
716
+ "Connector": 0.029464809224009514,
717
+ "Lineage": 0.023105787113308907,
718
+ "API/SDK": 0.017058314755558968
719
+ },
720
+ "sentiment": "Angry",
721
+ "priority": "P2"
722
+ }
723
+ }
724
+ ]
docs_corpus.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
docs_meta.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
faiss_index.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb0b698008aae120d30504c1784f0797dc800066b27d63740c079b2466549d8c
3
+ size 18026541
sample_tickets.json ADDED
@@ -0,0 +1,152 @@
1
+ [
2
+ {
3
+ "id": "TICKET-245",
4
+ "subject": "Connecting Snowflake to Atlan - required permissions?",
5
+ "body": "Hi team, we're trying to set up our primary Snowflake production database as a new source in Atlan, but the connection keeps failing. We've tried using our standard service account, but it's not working. Our entire BI team is blocked on this integration for a major upcoming project, so it's quite urgent. Could you please provide a definitive list of the exact permissions and credentials needed on the Snowflake side to get this working? Thanks."
6
+ },
7
+ {
8
+ "id": "TICKET-246",
9
+ "subject": "Which connectors automatically capture lineage?",
10
+ "body": "Hello, I'm new to Atlan and trying to understand the lineage capabilities. The documentation mentions automatic lineage, but it's not clear which of our connectors (we use Fivetran, dbt, and Tableau) support this out-of-the-box. We need to present a clear picture of our data flow to leadership next week. Can you explain how lineage capture differs for these tools?"
11
+ },
12
+ {
13
+ "id": "TICKET-247",
14
+ "subject": "Deployment of Atlan agent for private data lake",
15
+ "body": "Our primary data lake is hosted on-premise within a secure VPC and is not exposed to the internet. We understand we need to use the Atlan agent for this, but the setup instructions are a bit confusing for our security team. This is a critical source for us, and we can't proceed with our rollout until we get this connected. Can you provide a detailed deployment guide or connect us with a technical expert?"
16
+ },
17
+ {
18
+ "id": "TICKET-248",
19
+ "subject": "How to surface sample rows and schema changes?",
20
+ "body": "Hi, we've successfully connected our Redshift cluster, and the assets are showing up. However, my data analysts are asking how they can see sample data or recent schema changes directly within Atlan without having to go back to Redshift. Is this feature available? I feel like I'm missing something obvious."
21
+ },
22
+ {
23
+ "id": "TICKET-249",
24
+ "subject": "Exporting lineage view for a specific table",
25
+ "body": "For our quarterly audit, I need to provide a complete upstream and downstream lineage diagram for our core `fact_orders` table. I can see the lineage perfectly in the UI, but I can't find an option to export this view as an image or PDF. This is a hard requirement from our compliance team and the deadline is approaching fast. Please help!"
26
+ },
27
+ {
28
+ "id": "TICKET-250",
29
+ "subject": "Importing lineage from Airflow jobs",
30
+ "body": "We run hundreds of ETL jobs in Airflow, and we need to see that lineage reflected in Atlan. I've read that Atlan can integrate with Airflow, but how do we configure it to correctly map our DAGs to the specific datasets they are transforming? The current documentation is a bit high-level."
31
+ },
32
+ {
33
+ "id": "TICKET-251",
34
+ "subject": "Using the Visual Query Builder",
35
+ "body": "I'm a business analyst and not very comfortable with writing complex SQL. I was excited to see the Visual Query Builder in Atlan, but I'm having trouble figuring out how to join multiple tables and save my query for later use. Is there a tutorial or a quick guide you can point me to?"
36
+ },
37
+ {
38
+ "id": "TICKET-252",
39
+ "subject": "Programmatic extraction of lineage",
40
+ "body": "Our internal data science team wants to build a custom application that analyzes metadata propagation delays. To do this, we need to programmatically extract lineage data from Atlan via an API. Does the API expose lineage information, and if so, could you provide an example of the endpoint and the structure of the response?"
41
+ },
42
+ {
43
+ "id": "TICKET-253",
44
+ "subject": "Upstream lineage to Snowflake view not working",
45
+ "body": "This is infuriating. We have a critical Snowflake view, `finance.daily_revenue`, that is built from three upstream tables. Atlan is correctly showing the downstream dependencies, but the upstream lineage is completely missing. This makes the view untrustworthy for our analysts. We've re-run the crawler multiple times. What could be causing this? This is a huge problem for us."
46
+ },
47
+ {
48
+ "id": "TICKET-254",
49
+ "subject": "How to create a business glossary and link terms in bulk?",
50
+ "body": "We are migrating our existing business glossary from a spreadsheet into Atlan. We have over 500 terms. Manually creating each one and linking them to thousands of assets seems impossible. Is there a bulk import feature using CSV or an API to create terms and link them to assets? This is blocking our entire governance initiative."
51
+ },
52
+ {
53
+ "id": "TICKET-255",
54
+ "subject": "Creating a custom role for data stewards",
55
+ "body": "I'm trying to set up a custom role for our data stewards. They need permission to edit descriptions and link glossary terms, but they should NOT have permission to run queries or change connection settings. I'm looking at the default roles, but none of them fit perfectly. How can I create a new role with this specific set of permissions?"
56
+ },
57
+ {
58
+ "id": "TICKET-256",
59
+ "subject": "Mapping Active Directory groups to Atlan teams",
60
+ "body": "Our company policy requires us to manage all user access through Active Directory groups. We need to map our existing AD groups (e.g., 'data-analyst-finance', 'data-engineer-core') to teams within Atlan to automatically grant the correct permissions. I can't find the settings for this. How is this configured?"
61
+ },
62
+ {
63
+ "id": "TICKET-257",
64
+ "subject": "RBAC for assets vs. glossaries",
65
+ "body": "I need clarification on how Atlan's role-based access control works. If a user is denied access to a specific Snowflake schema, can they still see the glossary terms that are linked to the tables in that schema? I need to ensure our PII governance is airtight."
66
+ },
67
+ {
68
+ "id": "TICKET-258",
69
+ "subject": "Process for onboarding asset owners",
70
+ "body": "We've started identifying owners for our key data assets. What is the recommended workflow in Atlan to assign these owners and automatically notify them? We want to make sure they are aware of their responsibilities without us having to send manual emails for every assignment."
71
+ },
72
+ {
73
+ "id": "TICKET-259",
74
+ "subject": "How does Atlan surface sensitive fields like PII?",
75
+ "body": "Our security team is evaluating Atlan and their main question is around PII and sensitive data. How does Atlan automatically identify fields containing PII? What are our options to apply tags or masks to these fields once they are identified to prevent unauthorized access?"
76
+ },
77
+ {
78
+ "id": "TICKET-260",
79
+ "subject": "Authentication methods for APIs and SDKs",
80
+ "body": "We are planning to build several automations using the Atlan API and Python SDK. What authentication methods are supported? Is it just API keys, or can we use something like OAuth? We have a strict policy that requires key rotation every 90 days, so we need to understand how to manage this programmatically."
81
+ },
82
+ {
83
+ "id": "TICKET-261",
84
+ "subject": "Enabling and testing SAML SSO",
85
+ "body": "We are ready to enable SAML SSO with our Okta instance. However, we are very concerned about disrupting our active users if the configuration is wrong. Is there a way to test the SSO configuration for a specific user or group before we enable it for the entire workspace?"
86
+ },
87
+ {
88
+ "id": "TICKET-262",
89
+ "subject": "SSO login not assigning user to correct group",
90
+ "body": "I've just had a new user, 'test.user@company.com', log in via our newly configured SSO. They were authenticated successfully, but they were not added to the 'Data Analysts' group as expected based on our SAML assertions. This is preventing them from accessing any assets. What could be the reason for this mis-assignment?"
91
+ },
92
+ {
93
+ "id": "TICKET-263",
94
+ "subject": "Integration with existing DLP or secrets manager",
95
+ "body": "Does Atlan have the capability to integrate with third-party tools like a DLP (Data Loss Prevention) solution or a secrets manager like HashiCorp Vault? We need to ensure that connection credentials and sensitive metadata classifications are handled by our central security systems."
96
+ },
97
+ {
98
+ "id": "TICKET-264",
99
+ "subject": "Accessing audit logs for compliance reviews",
100
+ "body": "Our compliance team needs to perform a quarterly review of all activities within Atlan. They need to know who accessed what data, who made permission changes, etc. Where can we find these audit logs, and is there a way to export them or pull them via an API for our records?"
101
+ },
102
+ {
103
+ "id": "TICKET-265",
104
+ "subject": "How to programmatically create an asset using the REST API?",
105
+ "body": "I'm trying to create a new custom asset (a 'Report') using the REST API, but my requests keep failing with a 400 error. The API documentation is a bit sparse on the required payload structure for creating new entities. Could you provide a basic cURL or Python `requests` example of what a successful request body should look like?"
106
+ },
107
+ {
108
+ "id": "TICKET-266",
109
+ "subject": "SDK availability and Python example",
110
+ "body": "I'm a data engineer and prefer using SDKs over raw API calls. Which languages do you provide SDKs for? I'm particularly interested in Python. Where can I find the installation instructions (e.g., PyPI package name) and a short code snippet for a common task, like creating a new glossary term?"
111
+ },
112
+ {
113
+ "id": "TICKET-267",
114
+ "subject": "How do webhooks work in Atlan?",
115
+ "body": "I'm exploring using webhooks to send real-time notifications from Atlan to our internal Slack channel. I need to understand what types of events (e.g., asset updated, term created) can trigger a webhook. Also, how do we validate that the incoming payloads are genuinely from Atlan? Do you support payload signing?"
116
+ },
117
+ {
118
+ "id": "TICKET-268",
119
+ "subject": "Triggering an AWS Lambda from Atlan events",
120
+ "body": "We have a workflow where we want to trigger a custom AWS Lambda function whenever a specific Atlan tag (e.g., 'PII-Confirmed') is added to an asset. What is the recommended and most secure way to set this up? Should we use webhooks pointing to an API Gateway, or is there a more direct integration?"
121
+ },
122
+ {
123
+ "id": "TICKET-269",
124
+ "subject": "When to use Atlan automations vs. external services?",
125
+ "body": "I see that Atlan has a built-in 'Automations' feature. I'm trying to decide if I should use this to manage a workflow or if I should use an external service like Zapier or our own Airflow instance. Could you provide some guidance or examples on what types of workflows are best suited for the native automations versus an external tool?"
126
+ },
127
+ {
128
+ "id": "TICKET-270",
129
+ "subject": "Connector failed to crawl - where to check logs?",
130
+ "body": "URGENT: Our nightly Snowflake crawler failed last night and no new metadata was ingested. This is a critical failure as our morning reports are now missing lineage information. Where can I find the detailed error logs for the crawler run to understand what went wrong? I need to fix this ASAP."
131
+ },
132
+ {
133
+ "id": "TICKET-271",
134
+ "subject": "Asset extracted but not published to Atlan",
135
+ "body": "This is very strange. I'm looking at the crawler logs, and I can see that the asset 'schema.my_table' was successfully extracted from the source. However, when I search for this table in the Atlan UI, it doesn't appear. It seems like it's getting stuck somewhere between extraction and publishing. Can you please investigate the root cause?"
136
+ },
137
+ {
138
+ "id": "TICKET-272",
139
+ "subject": "How to measure adoption and generate reports?",
140
+ "body": "My manager is asking for metrics on our Atlan usage to justify the investment. I need to generate a report showing things like the number of active users, most frequently queried tables, and the number of assets with assigned owners. Does Atlan have a reporting or dashboarding feature for this?"
141
+ },
142
+ {
143
+ "id": "TICKET-273",
144
+ "subject": "Best practices for catalog hygiene",
145
+ "body": "We've been using Atlan for six months, and our catalog is already starting to get a bit messy with duplicate assets and stale metadata from old tests. As we roll this out to more teams, what are some common best practices or features within Atlan that can help us maintain good catalog hygiene and prevent this problem from getting worse?"
146
+ },
147
+ {
148
+ "id": "TICKET-274",
149
+ "subject": "How to scale Atlan across multiple business units?",
150
+ "body": "We are planning a global rollout of Atlan to multiple business units, each with its own data sources and governance teams. We're looking for advice on the best way to structure our Atlan instance. Should we use separate workspaces, or can we achieve isolation using teams and permissions within a single workspace while maintaining a consistent governance model?"
151
+ }
152
+ ]
src/__pycache__/classifier.cpython-313.pyc ADDED
Binary file (6.34 kB). View file
 
src/__pycache__/data_loader.cpython-313.pyc ADDED
Binary file (1.15 kB). View file
 
src/__pycache__/rag.cpython-313.pyc ADDED
Binary file (20.1 kB). View file
 
src/app.py ADDED
@@ -0,0 +1,228 @@
1
+ # src/app.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import json
5
+ from pathlib import Path
6
+ from data_loader import load_tickets
7
+ from classifier import classify_ticket, classify_all_and_save
8
+
9
+ # NEW: import RAG handler
10
+ try:
11
+ from rag import handle_rag_query
12
+ except Exception:
13
+ handle_rag_query = None # we'll handle absence gracefully
14
+
15
+ # Config
16
+ st.set_page_config(page_title="Atlan - Support Copilot (Phase 3)", layout="wide")
17
+ ROOT = Path(__file__).parent.parent.resolve() # project root
18
+ CLASSIFIED_PATH = ROOT.joinpath("classified_tickets_phase2.json")
19
+
20
+ st.title("Atlan — Support Copilot (Phase 3)")
21
+ st.markdown(
22
+ "**Phase 3:** Zero-shot topic classification + HF sentiment + rule-based priority + RAG (retrieval-augmented generation). "
23
+ "This demo shows bulk classification and an interactive agent with RAG."
24
+ )
25
+
26
+ # Sidebar controls
27
+ st.sidebar.header("Controls")
28
+ use_saved = st.sidebar.checkbox("Load pre-saved classified file (if available)", value=True)
29
+ run_classify_all = st.sidebar.button("Classify ALL tickets & Save (Phase 2)")
30
+ reload_ui = st.sidebar.button("Reload UI")
31
+
32
+ # NEW: RAG options in sidebar
33
+ st.sidebar.markdown("### RAG options")
34
+ use_openai = st.sidebar.checkbox("Use OpenAI for generation (if API key set)", value=False)
35
+ top_k = st.sidebar.slider("RAG: number of passages to retrieve", min_value=1, max_value=10, value=5)
36
+
37
+ # Safe reload: attempt API call, otherwise show instruction to refresh
38
+ if reload_ui:
39
+ try:
40
+ st.experimental_rerun()
41
+ except Exception:
42
+ st.info("Automatic reload isn't supported by this Streamlit version. Please refresh the browser page to reload the UI.")
43
+
44
+ # Load tickets (original) — call the loader without forcing a path so it uses its default
45
+ try:
46
+ tickets = load_tickets() # loader default = ../sample_tickets.json (project root)
47
+ except Exception as e:
48
+ st.error("Could not load sample tickets. Ensure sample_tickets.json exists at the project root (one level above src/).")
49
+ st.exception(e)
50
+ tickets = []
51
+
52
+ # If user asked to classify all, run classification and save (uses classifier defaults)
53
+ if run_classify_all:
54
+ with st.spinner("Running classification on all tickets (models may load on first run)..."):
55
+ try:
56
+ out_path = classify_all_and_save() # defaults -> saves to ../classified_tickets_phase2.json
57
+ st.success(f"Classified and saved to: {out_path}")
58
+ except Exception as e:
59
+ st.error("Error during batch classification. See details below.")
60
+ st.exception(e)
61
+
62
+ # Try to load pre-saved classified file (if requested and exists)
63
+ classified_data = None
64
+ if use_saved and CLASSIFIED_PATH.exists():
65
+ try:
66
+ classified_data = json.loads(CLASSIFIED_PATH.read_text(encoding="utf-8"))
67
+ except Exception as e:
68
+ st.warning("Could not read the saved classified file; falling back to live classification.")
69
+ st.exception(e)
70
+
71
+ tab1, tab2 = st.tabs(["Bulk Classification Dashboard", "Interactive Agent (demo + RAG)"])
72
+
73
+ with tab1:
74
+ st.header("Bulk ticket classification")
75
+ st.write("This view shows all tickets with their inferred topic tags, sentiment, and priority.")
76
+
77
+ rows = []
78
+ # If we have pre-classified data, use that (faster). Otherwise classify on the fly.
79
+ if classified_data:
80
+ for entry in classified_data:
81
+ c = entry.get("classification", {})
82
+ rows.append({
83
+ "id": entry.get("id"),
84
+ "subject": entry.get("subject"),
85
+ "topic_tags": ", ".join(c.get("topic_tags", [])),
86
+ "sentiment": c.get("sentiment", ""),
87
+ "priority": c.get("priority", ""),
88
+ })
89
+ else:
90
+ # Live classify (will call HF pipelines lazily)
91
+ with st.spinner("Classifying tickets (zero-shot)... this may take a few seconds on first run"):
92
+ for t in tickets:
93
+ try:
94
+ c = classify_ticket(t)
95
+ except Exception as e:
96
+ st.error(f"Error classifying ticket {t.get('id')}: {e}")
97
+ c = {"topic_tags": [], "sentiment": "Error", "priority": "Error"}
98
+ rows.append({
99
+ "id": t.get("id"),
100
+ "subject": t.get("subject"),
101
+ "topic_tags": ", ".join(c.get("topic_tags", [])),
102
+ "sentiment": c.get("sentiment", ""),
103
+ "priority": c.get("priority", ""),
104
+ })
105
+
106
+ df = pd.DataFrame(rows)
107
+ # basic filters
108
+ cols = st.columns([2, 1, 1, 1])
109
+ with cols[0]:
110
+ q = st.text_input("Filter by subject/text contains")
111
+ with cols[1]:
112
+ sel_topic = st.selectbox("Filter by topic (contains)", options=["(any)"] + sorted({t for row in rows for t in row["topic_tags"].split(", ") if t}))
113
+ with cols[2]:
114
+ sel_sent = st.selectbox("Filter by sentiment", options=["(any)","Angry","Frustrated","Neutral","Curious","Positive"])
115
+ with cols[3]:
116
+ sel_prio = st.selectbox("Filter by priority", options=["(any)","P0","P1","P2"])
117
+
118
+ df_display = df.copy()
119
+ if q:
120
+ df_display = df_display[df_display["subject"].str.contains(q, case=False, na=False) | df_display["topic_tags"].str.contains(q, case=False, na=False)]
121
+ if sel_topic and sel_topic != "(any)":
122
+ df_display = df_display[df_display["topic_tags"].str.contains(sel_topic, na=False)]
123
+ if sel_sent and sel_sent != "(any)":
124
+ df_display = df_display[df_display["sentiment"] == sel_sent]
125
+ if sel_prio and sel_prio != "(any)":
126
+ df_display = df_display[df_display["priority"] == sel_prio]
127
+
128
+ st.dataframe(df_display.reset_index(drop=True), use_container_width=True, height=420)
129
+
130
+ st.markdown("### Sample ticket detail")
131
+ # choose ticket
132
+ ids = df_display["id"].tolist()
133
+ if ids:
134
+ sel = st.selectbox("Select ticket", ids)
135
+ # find original ticket object (from classified_data if present else from tickets)
136
+ selected_full = None
137
+ if classified_data:
138
+ selected_full = next((x for x in classified_data if x["id"] == sel), None)
139
+ if not selected_full:
140
+ selected_full = next((x for x in tickets if x["id"] == sel), None)
141
+
142
+ st.write(selected_full)
143
+ st.markdown("**Classification (raw)**")
144
+ if selected_full and "classification" in selected_full:
145
+ st.json(selected_full["classification"])
146
+ else:
147
+ # classify on-the-fly for selected ticket if no classification exists
148
+ with st.spinner("Classifying selected ticket..."):
149
+ try:
150
+ c = classify_ticket(selected_full)
151
+ except Exception as e:
152
+ st.error("Error during classification of selected ticket.")
153
+ st.exception(e)
154
+ c = {}
155
+ st.json(c)
156
+ else:
157
+ st.info("No tickets to display with current filters.")
158
+
159
+ with tab2:
160
+ st.header("Interactive Agent (Phase 3 - analysis + RAG)")
161
+ st.markdown(
162
+ "Paste a ticket subject and body (or type). The backend analysis will show topic tags, sentiment and priority. "
163
+ "If the topic is one of the RAG-enabled categories (How-to, Product, Best practices, API/SDK, SSO), the app will run RAG and show a cited answer."
164
+ )
165
+
166
+ user_input = st.text_area("Paste a ticket subject + body (or type a new one)", height=220, placeholder="Subject line on first line, body below...")
167
+ analyze = st.button("Analyze input")
168
+
169
+ if analyze:
170
+ if not user_input.strip():
171
+ st.warning("Enter some ticket text to analyze.")
172
+ else:
173
+ # Infer subject/body: first line = subject
174
+ lines = user_input.strip().split("\n")
175
+ subject = lines[0]
176
+ body = "\n".join(lines[1:]).strip() if len(lines) > 1 else user_input.strip()
177
+ demo_ticket = {"id": "TEMP", "subject": subject, "body": body}
178
+ with st.spinner("Analyzing (zero-shot + sentiment)..."):
179
+ try:
180
+ c = classify_ticket(demo_ticket)
181
+ except Exception as e:
182
+ st.error("Error during classification.")
183
+ st.exception(e)
184
+ c = {"topic_tags": [], "sentiment": "Error", "priority": "Error"}
185
+
186
+ st.subheader("Internal analysis (backend view)")
187
+ st.json(c)
188
+
189
+ st.subheader("Final response (frontend view)")
190
+ # RAG-enabled topics
191
+ allowed_rag = {"How-to", "Product", "Best practices", "API/SDK", "SSO"}
192
+
193
+ # If ticket topic is RAG-enabled -> run RAG
194
+ if any(lbl in allowed_rag for lbl in c.get("topic_tags", [])):
195
+ if handle_rag_query is None:
196
+ st.error("RAG handler not found. Make sure src/rag.py exists and is importable.")
197
+ else:
198
+ st.info("RAG triggered — retrieving docs and generating an answer...")
199
+ with st.spinner("Retrieving + generating answer (may take a few seconds)..."):
200
+ # Use the combined subject+body as the query
201
+ query_text = f"{subject}\n\n{body}"
202
+ try:
203
+ rag_res = handle_rag_query(query_text, top_k=top_k, use_openai=use_openai)
204
+ except Exception as e:
205
+ st.error("Error during RAG operation.")
206
+ st.exception(e)
207
+ rag_res = {"answer": "RAG failed.", "sources": [], "retrieved": []}
208
+
209
+ st.subheader("Answer")
210
+ st.markdown(rag_res.get("answer", "No answer returned."))
211
+
212
+ st.subheader("Sources (citations)")
213
+ for s in rag_res.get("sources", []):
214
+ st.write(s)
215
+
216
+ st.subheader("Top retrieved passages (debug view)")
217
+ for r in rag_res.get("retrieved", [])[:top_k]:
218
+ st.markdown(f"**Title:** {r.get('title','(no title)')} \n**URL:** {r.get('url')} \n**Score:** {r.get('score'):.4f}")
219
+ st.write(r.get("text","")[:800] + ("..." if len(r.get("text","")) > 800 else ""))
220
+
221
+ else:
222
+ st.success(f"This ticket has been classified as {c.get('topic_tags', [])} and routed to the appropriate team.")
223
+
224
+ st.markdown("---")
225
+ st.caption(
226
+ "Phase 3 demo — zero-shot topic classification (facebook/bart-large-mnli), sentiment (distilbert SST-2), and RAG using local FAISS + sentence-transformers. "
227
+ "Toggle 'Use OpenAI' in the sidebar to use the OpenAI API for generation (requires OPENAI_API_KEY in env)."
228
+ )
src/classifier.py ADDED
@@ -0,0 +1,150 @@
1
+ # src/classifier.py
2
+ from typing import Dict, List, Union
3
+ from transformers import pipeline
4
+ import math
5
+ import json
6
+ from pathlib import Path
7
+
8
+ # Lazy-loaded pipelines (module-level to reuse)
9
+ _zero_shot_clf = None
10
+ _sentiment_clf = None
11
+
12
+ def get_zero_shot_classifier():
13
+ global _zero_shot_clf
14
+ if _zero_shot_clf is None:
15
+ # BART or RoBERTa NLI models are common choices
16
+ _zero_shot_clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
17
+ return _zero_shot_clf
18
+
19
+ def get_sentiment_classifier():
20
+ global _sentiment_clf
21
+ if _sentiment_clf is None:
22
+ # SST-2 fine-tuned model
23
+ _sentiment_clf = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
24
+ return _sentiment_clf
25
+
26
+ # Schema - fixed topic labels requested by the assignment
27
+ TOPIC_LABELS = [
28
+ "How-to",
29
+ "Product",
30
+ "Connector",
31
+ "Lineage",
32
+ "API/SDK",
33
+ "SSO",
34
+ "Glossary",
35
+ "Best practices",
36
+ "Sensitive data"
37
+ ]
38
+
39
+ # Optionally add synonyms/prompts to nudge zero-shot
40
+ LABEL_DESCRIPTIONS = {
41
+ "How-to": "user asking how to perform a task or request a tutorial",
42
+ "Product": "product feature, UI or general product question",
43
+ "Connector": "questions about connectors, crawlers, integrations and failures",
44
+ "Lineage": "questions about lineage, upstream/downstream or lineage exports",
45
+ "API/SDK": "developer questions about APIs, SDKs, endpoints, code examples",
46
+ "SSO": "authentication, SAML, SSO, Okta, login issues",
47
+ "Glossary": "business glossary, terms, bulk import of glossary terms",
48
+ "Best practices": "request for recommended approach, best practices or governance",
49
+ "Sensitive data": "questions about PII, masking, DLP, secrets"
50
+ }
51
+
52
+ def classify_topic_zero_shot(text: str, labels: List[str] = TOPIC_LABELS, hypothesis_template: str = "This text is about {}.") -> Dict:
53
+ """
54
+ Returns a dictionary with labels and scores from zero-shot classifier.
55
+ """
56
+ clf = get_zero_shot_classifier()
57
+ # The HF zero-shot pipeline can accept a 'hypothesis_template' to improve results.
58
+ res = clf(sequences=text, candidate_labels=labels, hypothesis_template=hypothesis_template)
59
+ # Example res: {'sequence':..., 'labels': [...], 'scores':[...]}
60
+ # The full result is returned; the caller (classify_ticket) filters the top-N labels above a threshold
61
+ return res
62
+
63
+ def classify_sentiment_hf(text: str) -> str:
64
+ """
65
+ Returns a human-friendly sentiment label, mapping HF outputs to your schema.
66
+ HF model returns POSITIVE/NEGATIVE with a score.
67
+ We'll use a small mapping to Frustrated/Curious/Angry/Neutral/Positive.
68
+ """
69
+ clf = get_sentiment_classifier()
70
+ out = clf(text[:1000]) # truncate long text for speed
71
+ # out like [{'label': 'NEGATIVE', 'score': 0.999}]
72
+ if not out:
73
+ return "Neutral"
74
+ lab = out[0]["label"].upper()
75
+ score = out[0]["score"]
76
+ # simple mapping
77
+ if lab == "NEGATIVE":
78
+ # distinguish angry vs frustrated by strength
79
+ if score > 0.9:
80
+ return "Angry"
81
+ return "Frustrated"
82
+ elif lab == "POSITIVE":
83
+ if score > 0.9:
84
+ return "Positive"
85
+ return "Curious"
86
+ else:
87
+ return "Neutral"
88
+
89
+ # Keep same rule-based priority function (deterministic SLA logic)
90
+ PRIORITY_KEYWORDS_P0 = ["urgent", "asap", "blocked", "blocker", "critical", "production", "failed", "failure", "infuriating", "can't", "cant", "down", "urgent:"]
91
+ PRIORITY_KEYWORDS_P1 = ["need", "important", "deadline", "next week", "approaching", "required", "soon", "high"]
92
+
93
+ def classify_priority(text: str, subject: str = "") -> str:
94
+ t = (subject + " " + text).lower()
95
+ for k in PRIORITY_KEYWORDS_P0:
96
+ if k in t:
97
+ return "P0"
98
+ for k in PRIORITY_KEYWORDS_P1:
99
+ if k in t:
100
+ return "P1"
101
+ return "P2"
102
+
103
+ def classify_ticket(ticket: Dict, top_k: int = 2, label_score_threshold: float = 0.25) -> Dict:
104
+ """
105
+ Full classification of a single ticket:
106
+ - topic_tags: top_k labels from zero-shot (above threshold)
107
+ - sentiment: HF sentiment mapped
108
+ - priority: rule-based
109
+ """
110
+ text = " ".join([ticket.get("subject", ""), ticket.get("body", "")])
111
+ z = classify_topic_zero_shot(text)
112
+ labels = z.get("labels", [])
113
+ scores = z.get("scores", [])
114
+ # Collect top_k labels above threshold
115
+ topic_tags = []
116
+ for lbl, score in zip(labels, scores):
117
+ if score >= label_score_threshold:
118
+ topic_tags.append(lbl)
119
+ if len(topic_tags) >= top_k:
120
+ break
121
+ # fallback: if nothing passes threshold, take the top label
122
+ if not topic_tags and labels:
123
+ topic_tags = [labels[0]]
124
+
125
+ sentiment = classify_sentiment_hf(text)
126
+ priority = classify_priority(ticket.get("body",""), ticket.get("subject",""))
127
+
128
+ return {
129
+ "id": ticket.get("id"),
130
+ "topic_tags": topic_tags,
131
+ "topic_scores": {lbl: float(s) for lbl, s in zip(labels, scores)},
132
+ "sentiment": sentiment,
133
+ "priority": priority
134
+ }
135
+
136
+ # batch classify and save JSON
137
+ def classify_all_and_save(input_path: Union[str, Path] = "../sample_tickets.json", output_path: Union[str, Path] = "../classified_tickets_phase2.json"):
138
+ p_in = Path(__file__).parent.joinpath(input_path).resolve()
139
+ p_out = Path(__file__).parent.joinpath(output_path).resolve()
140
+ tickets = json.loads(p_in.read_text(encoding="utf-8"))
141
+ results = []
142
+ for t in tickets:
143
+ c = classify_ticket(t)
144
+ results.append({**t, "classification": c})
145
+ p_out.write_text(json.dumps(results, indent=2), encoding="utf-8")
146
+ print(f"Saved {len(results)} classified tickets to {p_out}")
147
+ return p_out
148
+
149
+ if __name__ == "__main__":
150
+ classify_all_and_save()
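A quick interactive check of the classifier above (a sketch, not part of the commit; it assumes src/ is importable from the repo root, and the first call downloads the bart-large-mnli and distilbert SST-2 models):

    from classifier import classify_ticket  # or `from src.classifier import classify_ticket`, depending on sys.path

    ticket = {
        "id": "TICKET-DEMO",
        "subject": "Snowflake connector keeps failing",
        "body": "Our production crawler is blocked and the BI team is stuck. This is urgent.",
    }
    result = classify_ticket(ticket, top_k=2, label_score_threshold=0.25)
    print(result["topic_tags"], result["sentiment"], result["priority"])
    # e.g. ['Connector', 'Product'] 'Angry' 'P0' -- exact tags and scores depend on the models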
src/data_loader.py ADDED
@@ -0,0 +1,16 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import List, Dict
4
+
5
+ def load_tickets(path: str = "../sample_tickets.json") -> List[Dict]:
6
+ p = Path(__file__).parent.joinpath(path).resolve()
7
+ with open(p, "r", encoding="utf-8") as f:
8
+ tickets = json.load(f)
9
+ return tickets
10
+
11
+ if __name__ == "__main__":
12
+ tickets = load_tickets()
13
+ print(f"Loaded {len(tickets)} tickets")
14
+ # show first ticket
15
+ import pprint
16
+ pprint.pprint(tickets[0])
src/indexer.py ADDED
@@ -0,0 +1,94 @@
1
+ # src/indexer.py
2
+ """
3
+ Index the cleaned corpus into FAISS using sentence-level/small-chunk passages.
4
+
5
+ Outputs:
6
+ - faiss_index.bin (FAISS index)
7
+ - docs_meta.jsonl (one JSON line per vector with fields: id, url, title, text)
8
+ """
9
+ from sentence_transformers import SentenceTransformer
10
+ import faiss
11
+ import ujson as json
12
+ from pathlib import Path
13
+ from tqdm import tqdm
14
+ import numpy as np
15
+ import re
16
+
17
+ CORPUS_PATH = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
18
+ META_PATH = Path(__file__).parent.parent.joinpath("docs_meta.jsonl")
19
+ INDEX_PATH = Path(__file__).parent.parent.joinpath("faiss_index.bin")
20
+ EMBED_MODEL = "all-MiniLM-L6-v2"
21
+
22
+ # chunking by sentences, group up to N sentences per chunk (1-3 recommended)
23
+ MAX_SENTENCES_PER_CHUNK = 2
24
+
25
+ SENT_SPLIT_RE = re.compile(r'([.!?])\s+')
26
+
27
+ def split_into_sentences(text: str):
28
+ if not text:
29
+ return []
30
+ parts = SENT_SPLIT_RE.split(text)
31
+ sents = []
32
+ for i in range(0, len(parts), 2):
33
+ chunk = parts[i].strip()
34
+ punct = parts[i+1] if (i+1)<len(parts) else ""
35
+ sent = (chunk + punct).strip()
36
+ if sent:
37
+ sents.append(sent)
38
+ return sents
39
+
40
+ def build_index():
41
+ if not CORPUS_PATH.exists():
42
+ raise FileNotFoundError(f"Corpus not found at {CORPUS_PATH}. Run src/scrape_docs.py first.")
43
+
44
+ model = SentenceTransformer(EMBED_MODEL)
45
+ embeddings = []
46
+ meta = []
47
+ idx = 0
48
+
49
+ # read corpus and chunk into sentence groups
50
+ with CORPUS_PATH.open("r", encoding="utf-8") as f:
51
+ for line in tqdm(f, desc="Reading corpus"):
52
+ doc = json.loads(line)
53
+ url = doc.get("url")
54
+ title = doc.get("title","")
55
+ text = doc.get("text","")
56
+ sents = split_into_sentences(text)
57
+ if not sents:
58
+ continue
59
+ # group sentences into small chunks (1..MAX_SENTENCES_PER_CHUNK)
60
+ i = 0
61
+ while i < len(sents):
62
+ chunk_sents = sents[i:i+MAX_SENTENCES_PER_CHUNK]
63
+ chunk_text = " ".join(chunk_sents).strip()
64
+ if chunk_text:
65
+ meta.append({"id": idx, "url": url, "title": title, "text": chunk_text})
66
+ idx += 1
67
+ i += MAX_SENTENCES_PER_CHUNK
68
+
69
+ if not meta:
70
+ raise RuntimeError("No chunks created from corpus (empty corpus?)")
71
+
72
+ # encode in batches for memory efficiency
73
+ texts = [m["text"] for m in meta]
74
+ batch_size = 64
75
+ all_embs = []
76
+ for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
77
+ batch = texts[i:i+batch_size]
78
+ embs = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
79
+ all_embs.append(embs)
80
+ embeddings = np.vstack(all_embs).astype("float32")
81
+
82
+ # normalize to use inner-product as cosine
83
+ faiss.normalize_L2(embeddings)
84
+ d = embeddings.shape[1]
85
+ index = faiss.IndexFlatIP(d)
86
+ index.add(embeddings)
87
+ faiss.write_index(index, str(INDEX_PATH))
88
+ with META_PATH.open("w", encoding="utf-8") as f:
89
+ for m in meta:
90
+ f.write(json.dumps(m, ensure_ascii=False) + "\n")
91
+ print(f"Built index with {index.ntotal} vectors. Saved to {INDEX_PATH}, meta to {META_PATH}")
92
+
93
+ if __name__ == "__main__":
94
+ build_index()
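Once build_index() has written faiss_index.bin and docs_meta.jsonl, querying is just an L2-normalized query embedding searched against the inner-product index (which is what makes the returned scores cosine similarities). A small sketch, assuming the two files sit in the repo root as above:

    import json
    from pathlib import Path
    import faiss
    from sentence_transformers import SentenceTransformer

    root = Path(".")
    index = faiss.read_index(str(root / "faiss_index.bin"))
    meta = [json.loads(l) for l in (root / "docs_meta.jsonl").read_text(encoding="utf-8").splitlines()]

    model = SentenceTransformer("all-MiniLM-L6-v2")
    q = model.encode(["How do I configure SAML SSO with Okta?"], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q)               # same normalization applied to the indexed vectors
    scores, ids = index.search(q, 5)    # inner product == cosine once both sides are normalized
    for s, i in zip(scores[0], ids[0]):
        print(round(float(s), 3), meta[i]["url"])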
src/rag.py ADDED
@@ -0,0 +1,419 @@
1
+ # src/rag.py
2
+ """
3
+ RAG module (updated)
4
+ - FAISS retrieval (sentence-transformers embeddings)
5
+ - Cross-encoder reranker (optional)
6
+ - Prompt template with sentence-aware snippet trimming
7
+ - Generation (OpenAI preferred) or local Flan-T5 fallback
8
+ - Post-processing: concise N sentences, no trailing "..." and no placeholders
9
+ - Deduplication / diversity of contexts by URL
10
+ - Procedural snippet handling (take next sentence if top is a header/list)
11
+ """
12
+ from pathlib import Path
13
+ import ujson as json
14
+ import numpy as np
15
+ import textwrap
16
+ import os
17
+ import faiss
18
+ import re
19
+
20
+ # embeddings & models
21
+ from sentence_transformers import SentenceTransformer, CrossEncoder
22
+
23
+ try:
24
+ import openai
25
+ except Exception:
26
+ openai = None
27
+
28
+ # local generator (transformers)
29
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
30
+
31
+ ROOT = Path(__file__).parent.parent.resolve()
32
+ INDEX_PATH = ROOT.joinpath("faiss_index.bin")
33
+ META_PATH = ROOT.joinpath("docs_meta.jsonl")
34
+
35
+ EMBED_MODEL = "all-MiniLM-L6-v2" # embeddings model (fast)
36
+ CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" # reranker
37
+
38
+ # local text generator model (CPU)
39
+ LOCAL_GEN_MODEL = "google/flan-t5-small"
40
+
41
+ # tune here: how many sentences to keep in final answer
42
+ MAX_ANSWER_SENTENCES = 2
43
+
44
+ # diversify: max chunks per url allowed in final top_candidates
45
+ MAX_CHUNKS_PER_URL = 2
46
+
47
+ # lazy-loaded resources
48
+ _index = None
49
+ _meta = None
50
+ _embed_model = None
51
+ _cross_encoder = None
52
+ _local_generator = None
53
+
54
+ # -------------------------
55
+ # Utilities: sentence helpers & tidy answer
56
+ # -------------------------
57
+ RE_SENT_SPLIT = re.compile(r'([.!?])\s+') # split and keep punctuation
58
+ RE_ANGLE_PLACEHOLDER = re.compile(r"<[^>]{1,200}>")
59
+ RE_DOUBLE_DASH_ID = re.compile(r"\b[a-zA-Z0-9_-]{3,}--\d{3,}\b")
60
+ RE_MULTI_DOTS = re.compile(r"\.{2,}")
61
+ RE_WHITESPACE = re.compile(r"\s+")
62
+
63
+ def split_into_sentences(text: str):
64
+ if not text:
65
+ return []
66
+ parts = RE_SENT_SPLIT.split(text)
67
+ sentences = []
68
+ for i in range(0, len(parts), 2):
69
+ chunk = parts[i].strip()
70
+ punct = parts[i+1] if (i+1)<len(parts) else ""
71
+ sentence = (chunk + punct).strip()
72
+ if sentence:
73
+ sentences.append(sentence)
74
+ return sentences
75
+
76
+ # -------------------------
77
+ # NEW: sentence-level extraction helpers with procedural handling
78
+ # -------------------------
79
+ def get_top_sentences_from_passage(passage_text: str, query: str, embed_model, top_n: int = 1):
80
+ """
81
+ Given a passage text, split into sentences and return top_n sentences by cosine similarity.
82
+ For 'procedural' outputs where the top sentence is a header/menu (short or 'how to'), also
83
+ include the next sentence to provide actionable content.
84
+ """
85
+ if not passage_text:
86
+ return []
87
+ sents = split_into_sentences(passage_text)
88
+ if not sents:
89
+ return []
90
+ if len(sents) <= top_n:
91
+ return sents[:top_n]
92
+
93
+ # embed query + sentences
94
+ q_emb = embed_model.encode([query], convert_to_numpy=True)
95
+ s_embs = embed_model.encode(sents, convert_to_numpy=True)
96
+
97
+ def norm(x):
98
+ n = np.linalg.norm(x)
99
+ return x / (n + 1e-10)
100
+ qn = norm(q_emb[0])
101
+ sims = [float(np.dot(qn, norm(se))) for se in s_embs]
102
+ idxs = sorted(range(len(sents)), key=lambda i: sims[i], reverse=True)[:top_n]
103
+
104
+ # if top sentence looks like a header/menu (very short, contains 'how to' or ends with ':'), also include the next sentence
105
+ out = []
106
+ for idx in idxs:
107
+ out.append(sents[idx])
108
+ # heuristic: if that sentence is short or contains "how to" or looks like heading, add next sentence if exists
109
+ s = sents[idx].lower()
110
+ word_count = len(s.split())
111
+ if (word_count <= 6 or 'how to' in s or s.endswith(':')) and (idx + 1) < len(sents):
112
+ out.append(sents[idx+1])
113
+ # dedupe while preserving order
114
+ seen = set()
115
+ final = []
116
+ for x in out:
117
+ if x not in seen:
118
+ final.append(x)
119
+ seen.add(x)
120
+ return final[:top_n]
121
+
122
+ def extract_fallback_from_contexts(contexts: list, query: str, n_sentences:int = 1) -> str:
123
+ """
124
+ Deterministic fallback: find the single best sentence across contexts and return it verbatim.
125
+ """
126
+ _, _, embed_model = load_index_and_meta()
127
+ best = None
128
+ best_score = -1.0
129
+ import numpy as np
130
+ q_emb = embed_model.encode([query], convert_to_numpy=True)[0]
131
+ def norm(x):
132
+ n=np.linalg.norm(x); return x/(n+1e-10)
133
+ qn = norm(q_emb)
134
+
135
+ for c in contexts:
136
+ sents = split_into_sentences(c.get("text",""))
137
+ if not sents:
138
+ continue
139
+ s_embs = embed_model.encode(sents, convert_to_numpy=True)
140
+ for s, se in zip(sents, s_embs):
141
+ sc = float(np.dot(qn, norm(se)))
142
+ if sc > best_score:
143
+ best_score = sc
144
+ best = s
145
+ if not best:
146
+ return ""
147
+ return best
148
+
149
+ def tidy_answer(ans: str, max_sentences: int = MAX_ANSWER_SENTENCES) -> str:
150
+ if not ans:
151
+ return ans
152
+ a = ans
153
+ a = RE_ANGLE_PLACEHOLDER.sub(" ", a)
154
+ a = RE_DOUBLE_DASH_ID.sub(" ", a)
155
+ a = a.replace("…", ". ")
156
+ a = RE_MULTI_DOTS.sub(". ", a)
157
+ a = RE_WHITESPACE.sub(" ", a).strip()
158
+ sents = split_into_sentences(a)
159
+ if not sents:
160
+ snippet = a[:300].strip()
161
+ if not snippet.endswith("."):
162
+ snippet = snippet.rstrip(" .,") + "."
163
+ return snippet
164
+ take = sents[:max_sentences]
165
+ out = " ".join(take).strip()
166
+ if out and out[-1] not in ".!?":
167
+ out = out.rstrip(" .,") + "."
168
+ return out
169
+
170
+ # -------------------------
171
+ # Loading helpers
172
+ # -------------------------
173
+ def load_index_and_meta():
174
+ global _index, _meta, _embed_model
175
+ if _index is None:
176
+ if not INDEX_PATH.exists():
177
+ raise FileNotFoundError(f"FAISS index not found at {INDEX_PATH}. Run src/indexer.py first.")
178
+ _index = faiss.read_index(str(INDEX_PATH))
179
+ if _meta is None:
180
+ if not META_PATH.exists():
181
+ raise FileNotFoundError(f"Meta file not found at {META_PATH}. Run src/indexer.py first.")
182
+ _meta = [json.loads(l) for l in META_PATH.read_text(encoding="utf-8").splitlines()]
183
+ if _embed_model is None:
184
+ _embed_model = SentenceTransformer(EMBED_MODEL)
185
+ return _index, _meta, _embed_model
186
+
187
+ def get_cross_encoder():
188
+ global _cross_encoder
189
+ if _cross_encoder is None:
190
+ _cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL)
191
+ return _cross_encoder
192
+
193
+ def get_local_generator():
194
+ global _local_generator
195
+ if _local_generator is None:
196
+ tok = AutoTokenizer.from_pretrained(LOCAL_GEN_MODEL)
197
+ model = AutoModelForSeq2SeqLM.from_pretrained(LOCAL_GEN_MODEL)
198
+ _local_generator = pipeline("text2text-generation", model=model, tokenizer=tok, device=-1)
199
+ return _local_generator
200
+
201
+ # -------------------------
202
+ # Embedding + retrieval
203
+ # -------------------------
204
+ def embed_query(q: str):
205
+ _, _, em = load_index_and_meta()
206
+ emb = em.encode(q)
207
+ emb = np.asarray(emb, dtype="float32")
208
+ if emb.ndim == 1:
209
+ emb = emb.reshape(1, -1)
210
+ faiss.normalize_L2(emb)
211
+ return emb
212
+
213
+ def retrieve_candidates(query: str, top_k: int = 50):
214
+ index, meta, _ = load_index_and_meta()
215
+ emb = embed_query(query)
216
+ D, I = index.search(emb, top_k)
217
+ results = []
218
+ if len(I) == 0:
219
+ return results
220
+ for score, idx in zip(D[0], I[0]):
221
+ if idx < 0:
222
+ continue
223
+ m = meta[idx]
224
+ results.append({"score": float(score), "url": m["url"], "title": m.get("title",""), "text": m["text"], "id": idx})
225
+ return results
226
+
227
+ # -------------------------
228
+ # Reranking
229
+ # -------------------------
230
+ def rerank_with_cross(query: str, candidates: list, top_n: int = 5):
231
+ if not candidates:
232
+ return []
233
+ cross = get_cross_encoder()
234
+ inputs = [(query, c["text"]) for c in candidates]
235
+ try:
236
+ scores = cross.predict(inputs)
237
+ except Exception as e:
238
+ # fallback: if cross encoder fails, return top_n by original score
239
+ candidates.sort(key=lambda x: x.get("score",0), reverse=True)
240
+ return candidates[:top_n]
241
+ for c, s in zip(candidates, scores):
242
+ c["rerank_score"] = float(s)
243
+ candidates.sort(key=lambda x: x["rerank_score"], reverse=True)
244
+ return candidates[:top_n]
245
+
246
+ # -------------------------
247
+ # Snippet trimming helpers
248
+ # -------------------------
249
+ def trim_snippet_to_sentence(snippet: str, max_chars: int = 800) -> str:
250
+ if not snippet:
251
+ return snippet
252
+ s = snippet.replace("\n", " ").strip()
253
+ if len(s) <= max_chars:
254
+ return s
255
+ head = s[:max_chars]
256
+ last_dot = max(head.rfind("."), head.rfind("!"), head.rfind("?"))
257
+ if last_dot and last_dot > int(max_chars * 0.4):
258
+ return head[:last_dot+1].strip()
259
+ cut = head.rsplit(" ", 1)[0]
260
+ return cut.strip()
261
+
262
+ # -------------------------
263
+ # Prompt template (stronger)
264
+ # -------------------------
265
+ def build_prompt(query: str, contexts: list, sentences_per_context: int = 1):
266
+ """
267
+ Build a prompt that is explicit about producing an ACTIONABLE, concise answer.
268
+ We include one short sentence per source (selected by semantic similarity).
269
+ """
270
+ _, _, embed_model = load_index_and_meta()
271
+ sources_block = []
272
+ for i, c in enumerate(contexts, start=1):
273
+ passage = c.get("text", "").strip()
274
+ best_sents = get_top_sentences_from_passage(passage, query, embed_model, top_n=sentences_per_context)
275
+ snippet = " ".join(best_sents)
276
+ snippet = trim_snippet_to_sentence(snippet, max_chars=500)
277
+ snippet = snippet.rstrip(" .") + "." if snippet and snippet[-1] not in ".!?" else snippet
278
+ sources_block.append(f"[SRC_{i}] URL: {c.get('url')}\n[SRC_{i}] TEXT: {snippet}")
279
+
280
+ sources_text = "\n\n".join(sources_block)
281
+
282
+ prompt = textwrap.dedent(f"""
283
+ Use only the following snippets to produce a concise, ACTIONABLE answer (1-2 short sentences) that directly answers the question.
284
+ For "how-to" queries, produce concrete steps or exact fields to set where possible. Do NOT invent facts or add information not present in snippets.
285
+ If snippets do not contain an answer, reply: "I don't know — please consult the documentation." Then list the Source URLs used.
286
+
287
+ {sources_text}
288
+
289
+ Question:
290
+ {query}
291
+
292
+ Answer (be concise and then list Sources used as URLs):
293
+ """).strip()
294
+ return prompt
295
+
296
+ # -------------------------
297
+ # Generation (OpenAI or local) with fallback formatting
298
+ # -------------------------
299
+ def generate_answer_with_context(question: str, contexts: list, use_openai: bool = False):
300
+ prompt = build_prompt(question, contexts, sentences_per_context=1)
301
+
302
+ # Option A: OpenAI (preferred)
303
+ if use_openai and openai is not None and os.environ.get("OPENAI_API_KEY"):
304
+ try:
305
+ resp = openai.ChatCompletion.create(
306
+ model="gpt-3.5-turbo",
307
+ messages=[
308
+ {"role":"system","content":"You are a strict assistant. Use ONLY the provided documentation snippets to answer. Do not hallucinate."},
309
+ {"role":"user","content": prompt}
310
+ ],
311
+ temperature=0.0,
312
+ max_tokens=200,
313
+ stop=None
314
+ )
315
+ raw_answer = resp["choices"][0]["message"]["content"].strip()
316
+ answer = tidy_answer(raw_answer, max_sentences=MAX_ANSWER_SENTENCES)
317
+ # fallback if model returned unhelpful phrasing
318
+ if answer.lower().startswith("i'm") or "find the source" in answer.lower() or answer.lower().startswith("see"):
319
+ fallback = extract_fallback_from_contexts(contexts, question)
320
+ fallback = fallback.strip()
321
+ if fallback and not fallback.endswith(('.', '!', '?')):
322
+ fallback = fallback + '.'
323
+ if 'okta' in fallback.lower() or 'authenticator' in fallback.lower():
324
+ fallback = "Enable Okta SAML SSO: " + fallback
325
+ return tidy_answer(fallback, max_sentences=MAX_ANSWER_SENTENCES), [c["url"] for c in contexts]
326
+ used_urls = [c["url"] for c in contexts if c["url"] in raw_answer]
327
+ if not used_urls:
328
+ used_urls = [c["url"] for c in contexts]
329
+ return answer, used_urls
330
+ except Exception as e:
331
+ print("OpenAI generation failed:", e)
332
+
333
+ # Option B: Local generator fallback
334
+ gen = get_local_generator()
335
+ gen_kwargs = {
336
+ "max_length": 200,
337
+ "num_beams": 4,
338
+ "do_sample": False,
339
+ "no_repeat_ngram_size": 3,
340
+ "early_stopping": True
341
+ }
342
+ out = gen(prompt, **gen_kwargs)
343
+ raw_answer = out[0].get("generated_text","").strip()
344
+ answer = tidy_answer(raw_answer, max_sentences=MAX_ANSWER_SENTENCES)
345
+ if answer.lower().startswith("i'm") or "find the source" in answer.lower() or answer.lower().startswith("see"):
346
+ fallback = extract_fallback_from_contexts(contexts, question)
347
+ fallback = fallback.strip()
348
+ if fallback and not fallback.endswith(('.', '!', '?')):
349
+ fallback = fallback + '.'
350
+ if 'okta' in fallback.lower() or 'authenticator' in fallback.lower():
351
+ fallback = "Enable Okta SAML SSO: " + fallback
352
+ return tidy_answer(fallback, max_sentences=MAX_ANSWER_SENTENCES), [c["url"] for c in contexts]
353
+ return answer, [c["url"] for c in contexts]
354
+
355
+ # -------------------------
356
+ # Top-level handler with dedup/diversify
357
+ # -------------------------
358
+ def handle_rag_query(query: str, top_k: int = 5, use_openai: bool = False, rerank_candidates: int = 50):
359
+ candidates = retrieve_candidates(query, top_k=rerank_candidates)
360
+ if not candidates:
361
+ return {"answer": "No relevant documentation found.", "sources": [], "retrieved": []}
362
+
363
+ try:
364
+ top_candidates = rerank_with_cross(query, candidates, top_n=rerank_candidates)
365
+ except Exception as e:
366
+ print("Reranker failed, falling back to FAISS order:", e)
367
+ top_candidates = candidates[:rerank_candidates]
368
+
369
+ # Now pick final top_k diversified by URL:
370
+ # strategy: prefer at most MAX_CHUNKS_PER_URL per url; prefer higher rerank_score
371
+ # first, group by URL preserving order
372
+ url_counts = {}
373
+ diversified = []
374
+ for c in top_candidates:
375
+ url = c.get("url")
376
+ cnt = url_counts.get(url, 0)
377
+ if cnt < MAX_CHUNKS_PER_URL:
378
+ diversified.append(c)
379
+ url_counts[url] = cnt + 1
380
+ # stop early if we have enough
381
+ if len(diversified) >= max(top_k, len(top_candidates)):
382
+ break
383
+
384
+ # final trimming: ensure at most one chunk per URL until we fill top_k
385
+ seen_urls = set()
386
+ unique_candidates = []
387
+ for c in diversified:
388
+ u = c.get("url")
389
+ if u in seen_urls:
390
+ continue
391
+ unique_candidates.append(c)
392
+ seen_urls.add(u)
393
+ if len(unique_candidates) >= top_k:
394
+ break
395
+ # if we don't have enough unique URLs, allow second chunks (already in diversified)
396
+ if len(unique_candidates) < top_k:
397
+ # fill from diversified preserving order but skipping already selected items
398
+ for c in diversified:
399
+ if c in unique_candidates:
400
+ continue
401
+ unique_candidates.append(c)
402
+ if len(unique_candidates) >= top_k:
403
+ break
404
+ final_candidates = unique_candidates[:top_k]
405
+
406
+ # generate answer using final candidates
407
+ answer, urls = generate_answer_with_context(query, final_candidates, use_openai=use_openai)
408
+
409
+ return {"answer": answer, "sources": urls, "retrieved": final_candidates}
410
+
411
+ # small test if run as script
412
+ if __name__ == "__main__":
413
+ q = "How do I configure SAML SSO with Okta?"
414
+ print("Running test query:", q)
415
+ res = handle_rag_query(q, top_k=3, use_openai=False)
416
+ print("ANSWER:\n", res["answer"])
417
+ print("SOURCES:\n", res["sources"])
418
+ for r in res["retrieved"][:3]:
419
+ print("----\n", r["url"], "\n", r["text"][:300])
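To exercise the OpenAI branch instead of the local Flan-T5 fallback, the only extra requirements are the openai package (the pre-1.0 ChatCompletion API used above) and OPENAI_API_KEY in the environment; everything else is unchanged. A hypothetical invocation (the key value is a placeholder):

    import os
    os.environ.setdefault("OPENAI_API_KEY", "sk-...")   # placeholder; normally exported via the shell or .env

    from rag import handle_rag_query  # adjust the import to match how src/ sits on your path
    res = handle_rag_query("How do I configure SAML SSO with Okta?", top_k=3, use_openai=True)
    print(res["answer"])
    print(res["sources"])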
src/scrape_docs.py ADDED
@@ -0,0 +1,166 @@
1
+ # src/scrape_docs.py
2
+ """
3
+ Crawl allowed Atlan docs and write a cleaned docs_corpus.jsonl.
4
+ Improvements:
5
+ - robust cleaning of encoding artifacts (utf-8 replace + ftfy optional)
6
+ - removes paragraph markers ¶, <placeholders>, group-id--digits tokens
7
+ - strips boilerplate lines and tiny nav lines
8
+ - collapses and normalizes whitespace / encoding
9
+ - removes script/style/header/footer/nav/form tags before extracting
10
+ Output: docs_corpus.jsonl (overwrites)
11
+ """
12
+ import requests
13
+ import html
14
+ import re
15
+ from bs4 import BeautifulSoup
16
+ from urllib.parse import urljoin, urlparse
17
+ from collections import deque
18
+ from pathlib import Path
19
+ from url_normalize import url_normalize
20
+ import ujson as json
21
+ from tqdm import tqdm
22
+
23
+ OUTPUT = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
24
+ SEEDS = [
25
+ "https://docs.atlan.com/",
26
+ "https://developer.atlan.com/"
27
+ ]
28
+ ALLOWED_DOMAINS = {"docs.atlan.com", "developer.atlan.com"}
29
+ HEADERS = {"User-Agent": "atlan-rag-bot/0.1 (+your_email@example.com)"}
30
+
31
+ # heuristics
32
+ MIN_LINE_WORDS = 3
33
+ MIN_PAGE_WORDS = 30
34
+
35
+ # regex cleanup
36
+ RE_CONTROL = re.compile(r"[\x00-\x1f\x7f-\x9f]")
37
+ RE_PARAGRAPH_MARK = re.compile(r"¶")
38
+ RE_ANGLE_PLACEHOLDER = re.compile(r"<[^>\n]{1,200}>")
39
+ RE_DOUBLE_DASH_ID = re.compile(r"\b[a-zA-Z0-9_-]{3,}--\d{3,}\b")
40
+ RE_MULTIPLE_SPACES = re.compile(r"\s+")
41
+ RE_REPEATED_CHAR = re.compile(r"(.)\1{5,}") # long repeated chars
42
+ RE_BAD_ELLIPSIS = re.compile(r"\.{2,}") # multiple dots
43
+
44
+ BOILERPLATE_KEYWORDS = [
45
+ "table of contents", "overview", "read more", "privacy", "terms", "©", "cookie",
46
+ "search", "related articles", "last updated", "release notes", "subscribe", "breadcrumb"
47
+ ]
48
+
49
+ # optional: try to import ftfy for robust fixes (if installed)
50
+ try:
51
+ import ftfy
52
+ except Exception:
53
+ ftfy = None
54
+
55
+
56
+ def is_allowed(url):
57
+ try:
58
+ return urlparse(url).netloc in ALLOWED_DOMAINS
59
+ except Exception:
60
+ return False
61
+
62
+ def _keep_line(line: str) -> bool:
63
+ s = line.strip().lower()
64
+ if not s:
65
+ return False
66
+ if len(s.split()) < MIN_LINE_WORDS:
67
+ return False
68
+ if s.startswith("http") or s.startswith("www."):
69
+ return False
70
+ for k in BOILERPLATE_KEYWORDS:
71
+ if k in s:
72
+ return False
73
+ # short code-like lines
74
+ if len(s) < 10 and any(ch in s for ch in ['/', '.', '#']):
75
+ return False
76
+ return True
77
+
78
+ def clean_text(soup):
79
+ # remove undesired blocks
80
+ for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
81
+ tag.decompose()
82
+ parts = []
83
+ # only consider headings, paragraphs and list items
84
+ for el in soup.find_all(["h1", "h2", "h3", "p", "li"]):
85
+ t = el.get_text(separator=" ", strip=True)
86
+ if not t:
87
+ continue
88
+ # HTML unescape
89
+ t = html.unescape(t)
90
+ # remove paragraph mark and placeholders
91
+ t = RE_PARAGRAPH_MARK.sub(" ", t)
92
+ t = RE_ANGLE_PLACEHOLDER.sub(" ", t)
93
+ t = RE_DOUBLE_DASH_ID.sub(" ", t)
94
+ # remove control chars
95
+ t = RE_CONTROL.sub(" ", t)
96
+ # remove excessive repeated chars
97
+ t = RE_REPEATED_CHAR.sub(" ", t)
98
+ # normalize ellipsis
99
+ t = RE_BAD_ELLIPSIS.sub(". ", t)
100
+ # collapse whitespace
101
+ t = RE_MULTIPLE_SPACES.sub(" ", t).strip()
102
+ if _keep_line(t):
103
+ parts.append(t)
104
+ joined = "\n\n".join(parts).strip()
105
+ # final normalization: force utf-8 safe output & fix broken chars
106
+ joined = joined.encode('utf-8', errors='replace').decode('utf-8')
107
+ joined = joined.replace("\ufffd", " ")
108
+ # optional stronger fix using ftfy if available
109
+ if ftfy is not None:
110
+ joined = ftfy.fix_text(joined)
111
+ # Remove common weird bytes sequences left by encoding (Â, â etc.)
112
+ joined = joined.replace("Â", "").replace("â", "")
113
+ joined = RE_MULTIPLE_SPACES.sub(" ", joined).strip()
114
+ return joined
115
+
116
+ def crawl(seeds=SEEDS, max_pages=1000, max_depth=2):
117
+ seen = set()
118
+ out = []
119
+ q = deque()
120
+ for s in seeds:
121
+ q.append((s, 0))
122
+ pbar = tqdm(total=max_pages, desc="Crawl", unit="page")
123
+ while q and len(out) < max_pages:
124
+ url, depth = q.popleft()
125
+ url = url_normalize(url)
126
+ if url in seen:
127
+ continue
128
+ if depth > max_depth:
129
+ continue
130
+ if not is_allowed(url):
131
+ seen.add(url)
132
+ continue
133
+ try:
134
+ r = requests.get(url, headers=HEADERS, timeout=12)
135
+ if r.status_code != 200:
136
+ seen.add(url)
137
+ continue
138
+ soup = BeautifulSoup(r.text, "html.parser")
139
+ title = soup.title.string.strip() if (soup.title and soup.title.string) else url
140
+ text = clean_text(soup)
141
+ if text and len(text.split()) >= MIN_PAGE_WORDS:
142
+ out.append({"url": url, "title": title, "text": text})
143
+ pbar.update(1)
144
+ seen.add(url)
145
+ # find links
146
+ for a in soup.find_all("a", href=True):
147
+ href = urljoin(url, a["href"])
148
+ href = url_normalize(href)
149
+ if is_allowed(href) and href not in seen:
150
+ # skip common media files
151
+ if any(href.lower().endswith(ext) for ext in [".pdf", ".zip", ".png", ".jpg", ".jpeg", ".svg"]):
152
+ continue
153
+ q.append((href, depth + 1))
154
+ except Exception as e:
155
+ # keep going
156
+ seen.add(url)
157
+ continue
158
+ pbar.close()
159
+ # write JSONL (overwrite)
160
+ with OUTPUT.open("w", encoding="utf-8") as f:
161
+ for doc in out:
162
+ f.write(json.dumps(doc, ensure_ascii=False) + "\n")
163
+ print(f"Wrote {len(out)} docs to {OUTPUT}")
164
+
165
+ if __name__ == "__main__":
166
+ crawl(max_pages=400, max_depth=2)
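After a crawl, docs_corpus.jsonl holds one cleaned page per line with url/title/text fields, so coverage is easy to sanity-check before indexing. A small inspection sketch (standard library only):

    import json
    from pathlib import Path

    pages = [json.loads(l) for l in Path("docs_corpus.jsonl").read_text(encoding="utf-8").splitlines()]
    total_words = sum(len(p["text"].split()) for p in pages)
    print(f"{len(pages)} pages, {total_words} words (~{total_words // max(len(pages), 1)} per page)")
    print(pages[0]["url"], "-", pages[0]["title"][:80])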
streamlit_app.py ADDED
@@ -0,0 +1,11 @@
1
+ # streamlit_app.py
2
+ """
3
+ Wrapper so Hugging Face Spaces (Streamlit SDK) can launch the app.
4
+ It simply runs src/app.py as if it were the main file.
5
+ """
6
+
7
+ from pathlib import Path
8
+ import runpy
9
+
10
+ # Run src/app.py as the main script
11
+ runpy.run_path(str(Path(__file__).parent.joinpath("src", "app.py")), run_name="__main__")