Spaces:
Sleeping
Sleeping
dasdebanna
commited on
Commit
·
37a70cc
1
Parent(s):
59eb9c1
Prepare app for Hugging Face Space (include index files)
Browse files- .env +0 -0
- classified_tickets_phase2.json +724 -0
- docs_corpus.jsonl +0 -0
- docs_meta.jsonl +0 -0
- faiss_index.bin +3 -0
- sample_tickets.json +152 -0
- src/__pycache__/classifier.cpython-313.pyc +0 -0
- src/__pycache__/data_loader.cpython-313.pyc +0 -0
- src/__pycache__/rag.cpython-313.pyc +0 -0
- src/app.py +228 -0
- src/classifier.py +150 -0
- src/data_loader.py +16 -0
- src/indexer.py +94 -0
- src/rag.py +419 -0
- src/scrape_docs.py +166 -0
- streamlit_app.py +11 -0
.env
ADDED
|
File without changes
|
classified_tickets_phase2.json
ADDED
|
@@ -0,0 +1,724 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "TICKET-245",
|
| 4 |
+
"subject": "Connecting Snowflake to Atlan - required permissions?",
|
| 5 |
+
"body": "Hi team, we're trying to set up our primary Snowflake production database as a new source in Atlan, but the connection keeps failing. We've tried using our standard service account, but it's not working. Our entire BI team is blocked on this integration for a major upcoming project, so it's quite urgent. Could you please provide a definitive list of the exact permissions and credentials needed on the Snowflake side to get this working? Thanks.",
|
| 6 |
+
"classification": {
|
| 7 |
+
"id": "TICKET-245",
|
| 8 |
+
"topic_tags": [
|
| 9 |
+
"Product"
|
| 10 |
+
],
|
| 11 |
+
"topic_scores": {
|
| 12 |
+
"Product": 0.2536420226097107,
|
| 13 |
+
"Connector": 0.21206863224506378,
|
| 14 |
+
"Sensitive data": 0.15129192173480988,
|
| 15 |
+
"SSO": 0.09911767393350601,
|
| 16 |
+
"How-to": 0.09207943081855774,
|
| 17 |
+
"Best practices": 0.06830797344446182,
|
| 18 |
+
"Glossary": 0.05438947677612305,
|
| 19 |
+
"API/SDK": 0.047063831239938736,
|
| 20 |
+
"Lineage": 0.022039052098989487
|
| 21 |
+
},
|
| 22 |
+
"sentiment": "Angry",
|
| 23 |
+
"priority": "P0"
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"id": "TICKET-246",
|
| 28 |
+
"subject": "Which connectors automatically capture lineage?",
|
| 29 |
+
"body": "Hello, I'm new to Atlan and trying to understand the lineage capabilities. The documentation mentions automatic lineage, but it's not clear which of our connectors (we use Fivetran, dbt, and Tableau) support this out-of-the-box. We need to present a clear picture of our data flow to leadership next week. Can you explain how lineage capture differs for these tools?",
|
| 30 |
+
"classification": {
|
| 31 |
+
"id": "TICKET-246",
|
| 32 |
+
"topic_tags": [
|
| 33 |
+
"Lineage"
|
| 34 |
+
],
|
| 35 |
+
"topic_scores": {
|
| 36 |
+
"Lineage": 0.7043173909187317,
|
| 37 |
+
"Product": 0.06416956335306168,
|
| 38 |
+
"Sensitive data": 0.05306507274508476,
|
| 39 |
+
"Connector": 0.05121781677007675,
|
| 40 |
+
"How-to": 0.03940143436193466,
|
| 41 |
+
"Best practices": 0.03072594851255417,
|
| 42 |
+
"SSO": 0.02197032794356346,
|
| 43 |
+
"Glossary": 0.018482282757759094,
|
| 44 |
+
"API/SDK": 0.01665017567574978
|
| 45 |
+
},
|
| 46 |
+
"sentiment": "Angry",
|
| 47 |
+
"priority": "P1"
|
| 48 |
+
}
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"id": "TICKET-247",
|
| 52 |
+
"subject": "Deployment of Atlan agent for private data lake",
|
| 53 |
+
"body": "Our primary data lake is hosted on-premise within a secure VPC and is not exposed to the internet. We understand we need to use the Atlan agent for this, but the setup instructions are a bit confusing for our security team. This is a critical source for us, and we can't proceed with our rollout until we get this connected. Can you provide a detailed deployment guide or connect us with a technical expert?",
|
| 54 |
+
"classification": {
|
| 55 |
+
"id": "TICKET-247",
|
| 56 |
+
"topic_tags": [
|
| 57 |
+
"Sensitive data"
|
| 58 |
+
],
|
| 59 |
+
"topic_scores": {
|
| 60 |
+
"Sensitive data": 0.4692264795303345,
|
| 61 |
+
"How-to": 0.13292741775512695,
|
| 62 |
+
"Product": 0.1230754628777504,
|
| 63 |
+
"SSO": 0.06440062075853348,
|
| 64 |
+
"Connector": 0.058400604873895645,
|
| 65 |
+
"Best practices": 0.05110899731516838,
|
| 66 |
+
"Glossary": 0.04289798066020012,
|
| 67 |
+
"API/SDK": 0.036751993000507355,
|
| 68 |
+
"Lineage": 0.021210432052612305
|
| 69 |
+
},
|
| 70 |
+
"sentiment": "Angry",
|
| 71 |
+
"priority": "P0"
|
| 72 |
+
}
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"id": "TICKET-248",
|
| 76 |
+
"subject": "How to surface sample rows and schema changes?",
|
| 77 |
+
"body": "Hi, we've successfully connected our Redshift cluster, and the assets are showing up. However, my data analysts are asking how they can see sample data or recent schema changes directly within Atlan without having to go back to Redshift. Is this feature available? I feel like I'm missing something obvious.",
|
| 78 |
+
"classification": {
|
| 79 |
+
"id": "TICKET-248",
|
| 80 |
+
"topic_tags": [
|
| 81 |
+
"How-to"
|
| 82 |
+
],
|
| 83 |
+
"topic_scores": {
|
| 84 |
+
"How-to": 0.5293518900871277,
|
| 85 |
+
"Sensitive data": 0.12791359424591064,
|
| 86 |
+
"Connector": 0.11278416216373444,
|
| 87 |
+
"Product": 0.07702862471342087,
|
| 88 |
+
"Best practices": 0.035295821726322174,
|
| 89 |
+
"SSO": 0.03469147905707359,
|
| 90 |
+
"API/SDK": 0.030026594176888466,
|
| 91 |
+
"Lineage": 0.026599474251270294,
|
| 92 |
+
"Glossary": 0.0263083353638649
|
| 93 |
+
},
|
| 94 |
+
"sentiment": "Angry",
|
| 95 |
+
"priority": "P2"
|
| 96 |
+
}
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"id": "TICKET-249",
|
| 100 |
+
"subject": "Exporting lineage view for a specific table",
|
| 101 |
+
"body": "For our quarterly audit, I need to provide a complete upstream and downstream lineage diagram for our core `fact_orders` table. I can see the lineage perfectly in the UI, but I can't find an option to export this view as an image or PDF. This is a hard requirement from our compliance team and the deadline is approaching fast. Please help!",
|
| 102 |
+
"classification": {
|
| 103 |
+
"id": "TICKET-249",
|
| 104 |
+
"topic_tags": [
|
| 105 |
+
"Lineage"
|
| 106 |
+
],
|
| 107 |
+
"topic_scores": {
|
| 108 |
+
"Lineage": 0.6261916756629944,
|
| 109 |
+
"Sensitive data": 0.10166331380605698,
|
| 110 |
+
"Product": 0.08675874769687653,
|
| 111 |
+
"How-to": 0.05723349004983902,
|
| 112 |
+
"Best practices": 0.038304269313812256,
|
| 113 |
+
"Connector": 0.025699065998196602,
|
| 114 |
+
"SSO": 0.02333613485097885,
|
| 115 |
+
"Glossary": 0.022767778486013412,
|
| 116 |
+
"API/SDK": 0.018045460805296898
|
| 117 |
+
},
|
| 118 |
+
"sentiment": "Angry",
|
| 119 |
+
"priority": "P0"
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"id": "TICKET-250",
|
| 124 |
+
"subject": "Importing lineage from Airflow jobs",
|
| 125 |
+
"body": "We run hundreds of ETL jobs in Airflow, and we need to see that lineage reflected in Atlan. I've read that Atlan can integrate with Airflow, but how do we configure it to correctly map our DAGs to the specific datasets they are transforming? The current documentation is a bit high-level.",
|
| 126 |
+
"classification": {
|
| 127 |
+
"id": "TICKET-250",
|
| 128 |
+
"topic_tags": [
|
| 129 |
+
"Lineage"
|
| 130 |
+
],
|
| 131 |
+
"topic_scores": {
|
| 132 |
+
"Lineage": 0.6405794024467468,
|
| 133 |
+
"How-to": 0.07339552789926529,
|
| 134 |
+
"Sensitive data": 0.06700912863016129,
|
| 135 |
+
"Product": 0.04682222753763199,
|
| 136 |
+
"Best practices": 0.03841668739914894,
|
| 137 |
+
"SSO": 0.03800088167190552,
|
| 138 |
+
"API/SDK": 0.03554345667362213,
|
| 139 |
+
"Glossary": 0.03051498532295227,
|
| 140 |
+
"Connector": 0.029717685654759407
|
| 141 |
+
},
|
| 142 |
+
"sentiment": "Positive",
|
| 143 |
+
"priority": "P1"
|
| 144 |
+
}
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"id": "TICKET-251",
|
| 148 |
+
"subject": "Using the Visual Query Builder",
|
| 149 |
+
"body": "I'm a business analyst and not very comfortable with writing complex SQL. I was excited to see the Visual Query Builder in Atlan, but I'm having trouble figuring out how to join multiple tables and save my query for later use. Is there a tutorial or a quick guide you can point me to?",
|
| 150 |
+
"classification": {
|
| 151 |
+
"id": "TICKET-251",
|
| 152 |
+
"topic_tags": [
|
| 153 |
+
"How-to"
|
| 154 |
+
],
|
| 155 |
+
"topic_scores": {
|
| 156 |
+
"How-to": 0.20408794283866882,
|
| 157 |
+
"Sensitive data": 0.19889409840106964,
|
| 158 |
+
"Best practices": 0.1357269585132599,
|
| 159 |
+
"Product": 0.1232258677482605,
|
| 160 |
+
"Glossary": 0.08558867126703262,
|
| 161 |
+
"Connector": 0.07553622126579285,
|
| 162 |
+
"API/SDK": 0.06800619512796402,
|
| 163 |
+
"SSO": 0.06174485385417938,
|
| 164 |
+
"Lineage": 0.047189224511384964
|
| 165 |
+
},
|
| 166 |
+
"sentiment": "Angry",
|
| 167 |
+
"priority": "P2"
|
| 168 |
+
}
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"id": "TICKET-252",
|
| 172 |
+
"subject": "Programmatic extraction of lineage",
|
| 173 |
+
"body": "Our internal data science team wants to build a custom application that analyzes metadata propagation delays. To do this, we need to programmatically extract lineage data from Atlan via an API. Does the API expose lineage information, and if so, could you provide an example of the endpoint and the structure of the response?",
|
| 174 |
+
"classification": {
|
| 175 |
+
"id": "TICKET-252",
|
| 176 |
+
"topic_tags": [
|
| 177 |
+
"Lineage"
|
| 178 |
+
],
|
| 179 |
+
"topic_scores": {
|
| 180 |
+
"Lineage": 0.40903380513191223,
|
| 181 |
+
"How-to": 0.12547598779201508,
|
| 182 |
+
"Sensitive data": 0.12143483757972717,
|
| 183 |
+
"Product": 0.12053452432155609,
|
| 184 |
+
"Best practices": 0.0685185045003891,
|
| 185 |
+
"API/SDK": 0.06657088547945023,
|
| 186 |
+
"Glossary": 0.03478899598121643,
|
| 187 |
+
"SSO": 0.029303470626473427,
|
| 188 |
+
"Connector": 0.024339014664292336
|
| 189 |
+
},
|
| 190 |
+
"sentiment": "Angry",
|
| 191 |
+
"priority": "P1"
|
| 192 |
+
}
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"id": "TICKET-253",
|
| 196 |
+
"subject": "Upstream lineage to Snowflake view not working",
|
| 197 |
+
"body": "This is infuriating. We have a critical Snowflake view, `finance.daily_revenue`, that is built from three upstream tables. Atlan is correctly showing the downstream dependencies, but the upstream lineage is completely missing. This makes the view untrustworthy for our analysts. We've re-run the crawler multiple times. What could be causing this? This is a huge problem for us.",
|
| 198 |
+
"classification": {
|
| 199 |
+
"id": "TICKET-253",
|
| 200 |
+
"topic_tags": [
|
| 201 |
+
"Lineage"
|
| 202 |
+
],
|
| 203 |
+
"topic_scores": {
|
| 204 |
+
"Lineage": 0.6192259192466736,
|
| 205 |
+
"Sensitive data": 0.14630955457687378,
|
| 206 |
+
"Product": 0.08304131031036377,
|
| 207 |
+
"SSO": 0.03572985529899597,
|
| 208 |
+
"Glossary": 0.03130059689283371,
|
| 209 |
+
"How-to": 0.02567395195364952,
|
| 210 |
+
"Best practices": 0.025151418522000313,
|
| 211 |
+
"Connector": 0.018504904583096504,
|
| 212 |
+
"API/SDK": 0.015062461607158184
|
| 213 |
+
},
|
| 214 |
+
"sentiment": "Angry",
|
| 215 |
+
"priority": "P0"
|
| 216 |
+
}
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"id": "TICKET-254",
|
| 220 |
+
"subject": "How to create a business glossary and link terms in bulk?",
|
| 221 |
+
"body": "We are migrating our existing business glossary from a spreadsheet into Atlan. We have over 500 terms. Manually creating each one and linking them to thousands of assets seems impossible. Is there a bulk import feature using CSV or an API to create terms and link them to assets? This is blocking our entire governance initiative.",
|
| 222 |
+
"classification": {
|
| 223 |
+
"id": "TICKET-254",
|
| 224 |
+
"topic_tags": [
|
| 225 |
+
"How-to",
|
| 226 |
+
"Glossary"
|
| 227 |
+
],
|
| 228 |
+
"topic_scores": {
|
| 229 |
+
"How-to": 0.4522898495197296,
|
| 230 |
+
"Glossary": 0.4390662610530853,
|
| 231 |
+
"Sensitive data": 0.021999819204211235,
|
| 232 |
+
"Product": 0.01799585483968258,
|
| 233 |
+
"SSO": 0.017875080928206444,
|
| 234 |
+
"Best practices": 0.016796141862869263,
|
| 235 |
+
"API/SDK": 0.01395806111395359,
|
| 236 |
+
"Lineage": 0.010493746027350426,
|
| 237 |
+
"Connector": 0.009525217115879059
|
| 238 |
+
},
|
| 239 |
+
"sentiment": "Angry",
|
| 240 |
+
"priority": "P2"
|
| 241 |
+
}
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"id": "TICKET-255",
|
| 245 |
+
"subject": "Creating a custom role for data stewards",
|
| 246 |
+
"body": "I'm trying to set up a custom role for our data stewards. They need permission to edit descriptions and link glossary terms, but they should NOT have permission to run queries or change connection settings. I'm looking at the default roles, but none of them fit perfectly. How can I create a new role with this specific set of permissions?",
|
| 247 |
+
"classification": {
|
| 248 |
+
"id": "TICKET-255",
|
| 249 |
+
"topic_tags": [
|
| 250 |
+
"How-to"
|
| 251 |
+
],
|
| 252 |
+
"topic_scores": {
|
| 253 |
+
"How-to": 0.25342780351638794,
|
| 254 |
+
"Glossary": 0.2120532989501953,
|
| 255 |
+
"Product": 0.1341620683670044,
|
| 256 |
+
"Sensitive data": 0.08589652925729752,
|
| 257 |
+
"Connector": 0.08091939240694046,
|
| 258 |
+
"Best practices": 0.07565046101808548,
|
| 259 |
+
"API/SDK": 0.05833519995212555,
|
| 260 |
+
"SSO": 0.055307161062955856,
|
| 261 |
+
"Lineage": 0.044248051941394806
|
| 262 |
+
},
|
| 263 |
+
"sentiment": "Angry",
|
| 264 |
+
"priority": "P1"
|
| 265 |
+
}
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"id": "TICKET-256",
|
| 269 |
+
"subject": "Mapping Active Directory groups to Atlan teams",
|
| 270 |
+
"body": "Our company policy requires us to manage all user access through Active Directory groups. We need to map our existing AD groups (e.g., 'data-analyst-finance', 'data-engineer-core') to teams within Atlan to automatically grant the correct permissions. I can't find the settings for this. How is this configured?",
|
| 271 |
+
"classification": {
|
| 272 |
+
"id": "TICKET-256",
|
| 273 |
+
"topic_tags": [
|
| 274 |
+
"How-to"
|
| 275 |
+
],
|
| 276 |
+
"topic_scores": {
|
| 277 |
+
"How-to": 0.24342772364616394,
|
| 278 |
+
"Sensitive data": 0.19628578424453735,
|
| 279 |
+
"Best practices": 0.12237202376127243,
|
| 280 |
+
"Product": 0.11746875196695328,
|
| 281 |
+
"Glossary": 0.08366748690605164,
|
| 282 |
+
"SSO": 0.077210932970047,
|
| 283 |
+
"Connector": 0.0679486095905304,
|
| 284 |
+
"API/SDK": 0.04821234196424484,
|
| 285 |
+
"Lineage": 0.04340638965368271
|
| 286 |
+
},
|
| 287 |
+
"sentiment": "Angry",
|
| 288 |
+
"priority": "P0"
|
| 289 |
+
}
|
| 290 |
+
},
|
| 291 |
+
{
|
| 292 |
+
"id": "TICKET-257",
|
| 293 |
+
"subject": "RBAC for assets vs. glossaries",
|
| 294 |
+
"body": "I need clarification on how Atlan's role-based access control works. If a user is denied access to a specific Snowflake schema, can they still see the glossary terms that are linked to the tables in that schema? I need to ensure our PII governance is airtight.",
|
| 295 |
+
"classification": {
|
| 296 |
+
"id": "TICKET-257",
|
| 297 |
+
"topic_tags": [
|
| 298 |
+
"Sensitive data",
|
| 299 |
+
"Glossary"
|
| 300 |
+
],
|
| 301 |
+
"topic_scores": {
|
| 302 |
+
"Sensitive data": 0.5029798746109009,
|
| 303 |
+
"Glossary": 0.325946182012558,
|
| 304 |
+
"How-to": 0.03934895619750023,
|
| 305 |
+
"Best practices": 0.03307223320007324,
|
| 306 |
+
"Product": 0.0300066489726305,
|
| 307 |
+
"Lineage": 0.020622894167900085,
|
| 308 |
+
"SSO": 0.019720233976840973,
|
| 309 |
+
"Connector": 0.018421567976474762,
|
| 310 |
+
"API/SDK": 0.009881414473056793
|
| 311 |
+
},
|
| 312 |
+
"sentiment": "Angry",
|
| 313 |
+
"priority": "P1"
|
| 314 |
+
}
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"id": "TICKET-258",
|
| 318 |
+
"subject": "Process for onboarding asset owners",
|
| 319 |
+
"body": "We've started identifying owners for our key data assets. What is the recommended workflow in Atlan to assign these owners and automatically notify them? We want to make sure they are aware of their responsibilities without us having to send manual emails for every assignment.",
|
| 320 |
+
"classification": {
|
| 321 |
+
"id": "TICKET-258",
|
| 322 |
+
"topic_tags": [
|
| 323 |
+
"How-to"
|
| 324 |
+
],
|
| 325 |
+
"topic_scores": {
|
| 326 |
+
"How-to": 0.2538345754146576,
|
| 327 |
+
"Sensitive data": 0.242264062166214,
|
| 328 |
+
"Best practices": 0.15336112678050995,
|
| 329 |
+
"Product": 0.11627519130706787,
|
| 330 |
+
"SSO": 0.0775536373257637,
|
| 331 |
+
"Glossary": 0.055225685238838196,
|
| 332 |
+
"Connector": 0.043340008705854416,
|
| 333 |
+
"API/SDK": 0.03405303508043289,
|
| 334 |
+
"Lineage": 0.02409267984330654
|
| 335 |
+
},
|
| 336 |
+
"sentiment": "Positive",
|
| 337 |
+
"priority": "P2"
|
| 338 |
+
}
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"id": "TICKET-259",
|
| 342 |
+
"subject": "How does Atlan surface sensitive fields like PII?",
|
| 343 |
+
"body": "Our security team is evaluating Atlan and their main question is around PII and sensitive data. How does Atlan automatically identify fields containing PII? What are our options to apply tags or masks to these fields once they are identified to prevent unauthorized access?",
|
| 344 |
+
"classification": {
|
| 345 |
+
"id": "TICKET-259",
|
| 346 |
+
"topic_tags": [
|
| 347 |
+
"Sensitive data"
|
| 348 |
+
],
|
| 349 |
+
"topic_scores": {
|
| 350 |
+
"Sensitive data": 0.8192521333694458,
|
| 351 |
+
"How-to": 0.09408269077539444,
|
| 352 |
+
"Product": 0.022458547726273537,
|
| 353 |
+
"SSO": 0.01460132747888565,
|
| 354 |
+
"Best practices": 0.013057924807071686,
|
| 355 |
+
"Connector": 0.011657687835395336,
|
| 356 |
+
"Glossary": 0.008879464119672775,
|
| 357 |
+
"Lineage": 0.008671694435179234,
|
| 358 |
+
"API/SDK": 0.0073384749703109264
|
| 359 |
+
},
|
| 360 |
+
"sentiment": "Angry",
|
| 361 |
+
"priority": "P2"
|
| 362 |
+
}
|
| 363 |
+
},
|
| 364 |
+
{
|
| 365 |
+
"id": "TICKET-260",
|
| 366 |
+
"subject": "Authentication methods for APIs and SDKs",
|
| 367 |
+
"body": "We are planning to build several automations using the Atlan API and Python SDK. What authentication methods are supported? Is it just API keys, or can we use something like OAuth? We have a strict policy that requires key rotation every 90 days, so we need to understand how to manage this programmatically.",
|
| 368 |
+
"classification": {
|
| 369 |
+
"id": "TICKET-260",
|
| 370 |
+
"topic_tags": [
|
| 371 |
+
"API/SDK"
|
| 372 |
+
],
|
| 373 |
+
"topic_scores": {
|
| 374 |
+
"API/SDK": 0.876763105392456,
|
| 375 |
+
"Product": 0.029590053483843803,
|
| 376 |
+
"How-to": 0.02613384835422039,
|
| 377 |
+
"Sensitive data": 0.022094866260886192,
|
| 378 |
+
"Best practices": 0.01735789142549038,
|
| 379 |
+
"Glossary": 0.012602042406797409,
|
| 380 |
+
"Connector": 0.005761501379311085,
|
| 381 |
+
"SSO": 0.005750404670834541,
|
| 382 |
+
"Lineage": 0.003946291748434305
|
| 383 |
+
},
|
| 384 |
+
"sentiment": "Angry",
|
| 385 |
+
"priority": "P1"
|
| 386 |
+
}
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"id": "TICKET-261",
|
| 390 |
+
"subject": "Enabling and testing SAML SSO",
|
| 391 |
+
"body": "We are ready to enable SAML SSO with our Okta instance. However, we are very concerned about disrupting our active users if the configuration is wrong. Is there a way to test the SSO configuration for a specific user or group before we enable it for the entire workspace?",
|
| 392 |
+
"classification": {
|
| 393 |
+
"id": "TICKET-261",
|
| 394 |
+
"topic_tags": [
|
| 395 |
+
"SSO"
|
| 396 |
+
],
|
| 397 |
+
"topic_scores": {
|
| 398 |
+
"SSO": 0.8798295855522156,
|
| 399 |
+
"Sensitive data": 0.024509701877832413,
|
| 400 |
+
"Best practices": 0.023383136838674545,
|
| 401 |
+
"Product": 0.02332100085914135,
|
| 402 |
+
"How-to": 0.015699787065386772,
|
| 403 |
+
"Glossary": 0.010842178016901016,
|
| 404 |
+
"Connector": 0.008763168007135391,
|
| 405 |
+
"Lineage": 0.006830730475485325,
|
| 406 |
+
"API/SDK": 0.006820705719292164
|
| 407 |
+
},
|
| 408 |
+
"sentiment": "Angry",
|
| 409 |
+
"priority": "P2"
|
| 410 |
+
}
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"id": "TICKET-262",
|
| 414 |
+
"subject": "SSO login not assigning user to correct group",
|
| 415 |
+
"body": "I've just had a new user, 'test.user@company.com', log in via our newly configured SSO. They were authenticated successfully, but they were not added to the 'Data Analysts' group as expected based on our SAML assertions. This is preventing them from accessing any assets. What could be the reason for this mis-assignment?",
|
| 416 |
+
"classification": {
|
| 417 |
+
"id": "TICKET-262",
|
| 418 |
+
"topic_tags": [
|
| 419 |
+
"SSO"
|
| 420 |
+
],
|
| 421 |
+
"topic_scores": {
|
| 422 |
+
"SSO": 0.857302725315094,
|
| 423 |
+
"Sensitive data": 0.049394093453884125,
|
| 424 |
+
"Product": 0.021338628605008125,
|
| 425 |
+
"Best practices": 0.01603107526898384,
|
| 426 |
+
"Connector": 0.015189331956207752,
|
| 427 |
+
"Lineage": 0.011370034888386726,
|
| 428 |
+
"How-to": 0.010868269950151443,
|
| 429 |
+
"Glossary": 0.01049866247922182,
|
| 430 |
+
"API/SDK": 0.008007260970771313
|
| 431 |
+
},
|
| 432 |
+
"sentiment": "Angry",
|
| 433 |
+
"priority": "P2"
|
| 434 |
+
}
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"id": "TICKET-263",
|
| 438 |
+
"subject": "Integration with existing DLP or secrets manager",
|
| 439 |
+
"body": "Does Atlan have the capability to integrate with third-party tools like a DLP (Data Loss Prevention) solution or a secrets manager like HashiCorp Vault? We need to ensure that connection credentials and sensitive metadata classifications are handled by our central security systems.",
|
| 440 |
+
"classification": {
|
| 441 |
+
"id": "TICKET-263",
|
| 442 |
+
"topic_tags": [
|
| 443 |
+
"Sensitive data"
|
| 444 |
+
],
|
| 445 |
+
"topic_scores": {
|
| 446 |
+
"Sensitive data": 0.42641857266426086,
|
| 447 |
+
"Product": 0.1320459097623825,
|
| 448 |
+
"Connector": 0.09498800337314606,
|
| 449 |
+
"Best practices": 0.08815968036651611,
|
| 450 |
+
"How-to": 0.08476880192756653,
|
| 451 |
+
"SSO": 0.055265337228775024,
|
| 452 |
+
"Lineage": 0.05023786425590515,
|
| 453 |
+
"Glossary": 0.03815902769565582,
|
| 454 |
+
"API/SDK": 0.029956845566630363
|
| 455 |
+
},
|
| 456 |
+
"sentiment": "Angry",
|
| 457 |
+
"priority": "P1"
|
| 458 |
+
}
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"id": "TICKET-264",
|
| 462 |
+
"subject": "Accessing audit logs for compliance reviews",
|
| 463 |
+
"body": "Our compliance team needs to perform a quarterly review of all activities within Atlan. They need to know who accessed what data, who made permission changes, etc. Where can we find these audit logs, and is there a way to export them or pull them via an API for our records?",
|
| 464 |
+
"classification": {
|
| 465 |
+
"id": "TICKET-264",
|
| 466 |
+
"topic_tags": [
|
| 467 |
+
"How-to"
|
| 468 |
+
],
|
| 469 |
+
"topic_scores": {
|
| 470 |
+
"How-to": 0.3180291950702667,
|
| 471 |
+
"Sensitive data": 0.15853844583034515,
|
| 472 |
+
"Best practices": 0.10388915240764618,
|
| 473 |
+
"Glossary": 0.1034572422504425,
|
| 474 |
+
"Product": 0.08544151484966278,
|
| 475 |
+
"Connector": 0.07629270106554031,
|
| 476 |
+
"API/SDK": 0.07074188441038132,
|
| 477 |
+
"SSO": 0.04728706181049347,
|
| 478 |
+
"Lineage": 0.03632282093167305
|
| 479 |
+
},
|
| 480 |
+
"sentiment": "Angry",
|
| 481 |
+
"priority": "P1"
|
| 482 |
+
}
|
| 483 |
+
},
|
| 484 |
+
{
|
| 485 |
+
"id": "TICKET-265",
|
| 486 |
+
"subject": "How to programmatically create an asset using the REST API?",
|
| 487 |
+
"body": "I'm trying to create a new custom asset (a 'Report') using the REST API, but my requests keep failing with a 400 error. The API documentation is a bit sparse on the required payload structure for creating new entities. Could you provide a basic cURL or Python `requests` example of what a successful request body should look like?",
|
| 488 |
+
"classification": {
|
| 489 |
+
"id": "TICKET-265",
|
| 490 |
+
"topic_tags": [
|
| 491 |
+
"How-to"
|
| 492 |
+
],
|
| 493 |
+
"topic_scores": {
|
| 494 |
+
"How-to": 0.7019529938697815,
|
| 495 |
+
"Product": 0.07501153647899628,
|
| 496 |
+
"Sensitive data": 0.05422753840684891,
|
| 497 |
+
"API/SDK": 0.04047072306275368,
|
| 498 |
+
"Best practices": 0.03833013400435448,
|
| 499 |
+
"Glossary": 0.029218755662441254,
|
| 500 |
+
"Connector": 0.022240931168198586,
|
| 501 |
+
"SSO": 0.02166176587343216,
|
| 502 |
+
"Lineage": 0.01688558980822563
|
| 503 |
+
},
|
| 504 |
+
"sentiment": "Angry",
|
| 505 |
+
"priority": "P1"
|
| 506 |
+
}
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"id": "TICKET-266",
|
| 510 |
+
"subject": "SDK availability and Python example",
|
| 511 |
+
"body": "I'm a data engineer and prefer using SDKs over raw API calls. Which languages do you provide SDKs for? I'm particularly interested in Python. Where can I find the installation instructions (e.g., PyPI package name) and a short code snippet for a common task, like creating a new glossary term?",
|
| 512 |
+
"classification": {
|
| 513 |
+
"id": "TICKET-266",
|
| 514 |
+
"topic_tags": [
|
| 515 |
+
"API/SDK"
|
| 516 |
+
],
|
| 517 |
+
"topic_scores": {
|
| 518 |
+
"API/SDK": 0.5466265678405762,
|
| 519 |
+
"Glossary": 0.11553136259317398,
|
| 520 |
+
"How-to": 0.08963733166456223,
|
| 521 |
+
"Product": 0.06004762277007103,
|
| 522 |
+
"Sensitive data": 0.05992572009563446,
|
| 523 |
+
"Best practices": 0.056482475250959396,
|
| 524 |
+
"SSO": 0.032319601625204086,
|
| 525 |
+
"Connector": 0.020940439775586128,
|
| 526 |
+
"Lineage": 0.018488900735974312
|
| 527 |
+
},
|
| 528 |
+
"sentiment": "Angry",
|
| 529 |
+
"priority": "P2"
|
| 530 |
+
}
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"id": "TICKET-267",
|
| 534 |
+
"subject": "How do webhooks work in Atlan?",
|
| 535 |
+
"body": "I'm exploring using webhooks to send real-time notifications from Atlan to our internal Slack channel. I need to understand what types of events (e.g., asset updated, term created) can trigger a webhook. Also, how do we validate that the incoming payloads are genuinely from Atlan? Do you support payload signing?",
|
| 536 |
+
"classification": {
|
| 537 |
+
"id": "TICKET-267",
|
| 538 |
+
"topic_tags": [
|
| 539 |
+
"How-to"
|
| 540 |
+
],
|
| 541 |
+
"topic_scores": {
|
| 542 |
+
"How-to": 0.6155452728271484,
|
| 543 |
+
"Product": 0.0822950229048729,
|
| 544 |
+
"Sensitive data": 0.07453867048025131,
|
| 545 |
+
"Connector": 0.04730972647666931,
|
| 546 |
+
"SSO": 0.04478367045521736,
|
| 547 |
+
"Best practices": 0.04298403859138489,
|
| 548 |
+
"Glossary": 0.04120447859168053,
|
| 549 |
+
"API/SDK": 0.03110373578965664,
|
| 550 |
+
"Lineage": 0.0202353373169899
|
| 551 |
+
},
|
| 552 |
+
"sentiment": "Angry",
|
| 553 |
+
"priority": "P1"
|
| 554 |
+
}
|
| 555 |
+
},
|
| 556 |
+
{
|
| 557 |
+
"id": "TICKET-268",
|
| 558 |
+
"subject": "Triggering an AWS Lambda from Atlan events",
|
| 559 |
+
"body": "We have a workflow where we want to trigger a custom AWS Lambda function whenever a specific Atlan tag (e.g., 'PII-Confirmed') is added to an asset. What is the recommended and most secure way to set this up? Should we use webhooks pointing to an API Gateway, or is there a more direct integration?",
|
| 560 |
+
"classification": {
|
| 561 |
+
"id": "TICKET-268",
|
| 562 |
+
"topic_tags": [
|
| 563 |
+
"Sensitive data"
|
| 564 |
+
],
|
| 565 |
+
"topic_scores": {
|
| 566 |
+
"Sensitive data": 0.6416335701942444,
|
| 567 |
+
"How-to": 0.07063692808151245,
|
| 568 |
+
"Product": 0.06787507981061935,
|
| 569 |
+
"Best practices": 0.05057653412222862,
|
| 570 |
+
"Connector": 0.04566250368952751,
|
| 571 |
+
"SSO": 0.04065138101577759,
|
| 572 |
+
"Glossary": 0.034141793847084045,
|
| 573 |
+
"API/SDK": 0.0318898931145668,
|
| 574 |
+
"Lineage": 0.016932277008891106
|
| 575 |
+
},
|
| 576 |
+
"sentiment": "Angry",
|
| 577 |
+
"priority": "P2"
|
| 578 |
+
}
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"id": "TICKET-269",
|
| 582 |
+
"subject": "When to use Atlan automations vs. external services?",
|
| 583 |
+
"body": "I see that Atlan has a built-in 'Automations' feature. I'm trying to decide if I should use this to manage a workflow or if I should use an external service like Zapier or our own Airflow instance. Could you provide some guidance or examples on what types of workflows are best suited for the native automations versus an external tool?",
|
| 584 |
+
"classification": {
|
| 585 |
+
"id": "TICKET-269",
|
| 586 |
+
"topic_tags": [
|
| 587 |
+
"How-to"
|
| 588 |
+
],
|
| 589 |
+
"topic_scores": {
|
| 590 |
+
"How-to": 0.31247150897979736,
|
| 591 |
+
"Product": 0.1856827437877655,
|
| 592 |
+
"Best practices": 0.12680679559707642,
|
| 593 |
+
"Sensitive data": 0.11848830431699753,
|
| 594 |
+
"Glossary": 0.0666823759675026,
|
| 595 |
+
"SSO": 0.059052322059869766,
|
| 596 |
+
"API/SDK": 0.05207480862736702,
|
| 597 |
+
"Connector": 0.0423959419131279,
|
| 598 |
+
"Lineage": 0.03634531795978546
|
| 599 |
+
},
|
| 600 |
+
"sentiment": "Angry",
|
| 601 |
+
"priority": "P2"
|
| 602 |
+
}
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"id": "TICKET-270",
|
| 606 |
+
"subject": "Connector failed to crawl - where to check logs?",
|
| 607 |
+
"body": "URGENT: Our nightly Snowflake crawler failed last night and no new metadata was ingested. This is a critical failure as our morning reports are now missing lineage information. Where can I find the detailed error logs for the crawler run to understand what went wrong? I need to fix this ASAP.",
|
| 608 |
+
"classification": {
|
| 609 |
+
"id": "TICKET-270",
|
| 610 |
+
"topic_tags": [
|
| 611 |
+
"Connector"
|
| 612 |
+
],
|
| 613 |
+
"topic_scores": {
|
| 614 |
+
"Connector": 0.7956897616386414,
|
| 615 |
+
"Lineage": 0.10316770523786545,
|
| 616 |
+
"Sensitive data": 0.036917030811309814,
|
| 617 |
+
"Product": 0.021942023187875748,
|
| 618 |
+
"SSO": 0.012663195841014385,
|
| 619 |
+
"How-to": 0.009871531277894974,
|
| 620 |
+
"Glossary": 0.008735943585634232,
|
| 621 |
+
"Best practices": 0.005630514584481716,
|
| 622 |
+
"API/SDK": 0.005382323171943426
|
| 623 |
+
},
|
| 624 |
+
"sentiment": "Angry",
|
| 625 |
+
"priority": "P0"
|
| 626 |
+
}
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"id": "TICKET-271",
|
| 630 |
+
"subject": "Asset extracted but not published to Atlan",
|
| 631 |
+
"body": "This is very strange. I'm looking at the crawler logs, and I can see that the asset 'schema.my_table' was successfully extracted from the source. However, when I search for this table in the Atlan UI, it doesn't appear. It seems like it's getting stuck somewhere between extraction and publishing. Can you please investigate the root cause?",
|
| 632 |
+
"classification": {
|
| 633 |
+
"id": "TICKET-271",
|
| 634 |
+
"topic_tags": [
|
| 635 |
+
"Product"
|
| 636 |
+
],
|
| 637 |
+
"topic_scores": {
|
| 638 |
+
"Product": 0.3031204640865326,
|
| 639 |
+
"Sensitive data": 0.1932554543018341,
|
| 640 |
+
"Best practices": 0.1285562515258789,
|
| 641 |
+
"SSO": 0.08312994241714478,
|
| 642 |
+
"Connector": 0.07090307772159576,
|
| 643 |
+
"How-to": 0.0644526332616806,
|
| 644 |
+
"Lineage": 0.06026022136211395,
|
| 645 |
+
"API/SDK": 0.05608906224370003,
|
| 646 |
+
"Glossary": 0.04023294523358345
|
| 647 |
+
},
|
| 648 |
+
"sentiment": "Angry",
|
| 649 |
+
"priority": "P2"
|
| 650 |
+
}
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"id": "TICKET-272",
|
| 654 |
+
"subject": "How to measure adoption and generate reports?",
|
| 655 |
+
"body": "My manager is asking for metrics on our Atlan usage to justify the investment. I need to generate a report showing things like the number of active users, most frequently queried tables, and the number of assets with assigned owners. Does Atlan have a reporting or dashboarding feature for this?",
|
| 656 |
+
"classification": {
|
| 657 |
+
"id": "TICKET-272",
|
| 658 |
+
"topic_tags": [
|
| 659 |
+
"How-to"
|
| 660 |
+
],
|
| 661 |
+
"topic_scores": {
|
| 662 |
+
"How-to": 0.7723440527915955,
|
| 663 |
+
"Product": 0.057156000286340714,
|
| 664 |
+
"Sensitive data": 0.04286324605345726,
|
| 665 |
+
"Best practices": 0.03300917148590088,
|
| 666 |
+
"Connector": 0.02577182836830616,
|
| 667 |
+
"Glossary": 0.020914847031235695,
|
| 668 |
+
"SSO": 0.019454676657915115,
|
| 669 |
+
"API/SDK": 0.014868470840156078,
|
| 670 |
+
"Lineage": 0.013617790304124355
|
| 671 |
+
},
|
| 672 |
+
"sentiment": "Angry",
|
| 673 |
+
"priority": "P1"
|
| 674 |
+
}
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"id": "TICKET-273",
|
| 678 |
+
"subject": "Best practices for catalog hygiene",
|
| 679 |
+
"body": "We've been using Atlan for six months, and our catalog is already starting to get a bit messy with duplicate assets and stale metadata from old tests. As we roll this out to more teams, what are some common best practices or features within Atlan that can help us maintain good catalog hygiene and prevent this problem from getting worse?",
|
| 680 |
+
"classification": {
|
| 681 |
+
"id": "TICKET-273",
|
| 682 |
+
"topic_tags": [
|
| 683 |
+
"Best practices"
|
| 684 |
+
],
|
| 685 |
+
"topic_scores": {
|
| 686 |
+
"Best practices": 0.8946393728256226,
|
| 687 |
+
"Product": 0.03143538162112236,
|
| 688 |
+
"How-to": 0.0247127003967762,
|
| 689 |
+
"Sensitive data": 0.014560189098119736,
|
| 690 |
+
"Glossary": 0.009846128523349762,
|
| 691 |
+
"Connector": 0.007913430221378803,
|
| 692 |
+
"Lineage": 0.007438138592988253,
|
| 693 |
+
"SSO": 0.006103003863245249,
|
| 694 |
+
"API/SDK": 0.0033516811672598124
|
| 695 |
+
},
|
| 696 |
+
"sentiment": "Angry",
|
| 697 |
+
"priority": "P2"
|
| 698 |
+
}
|
| 699 |
+
},
|
| 700 |
+
{
|
| 701 |
+
"id": "TICKET-274",
|
| 702 |
+
"subject": "How to scale Atlan across multiple business units?",
|
| 703 |
+
"body": "We are planning a global rollout of Atlan to multiple business units, each with its own data sources and governance teams. We're looking for advice on the best way to structure our Atlan instance. Should we use separate workspaces, or can we achieve isolation using teams and permissions within a single workspace while maintaining a consistent governance model?",
|
| 704 |
+
"classification": {
|
| 705 |
+
"id": "TICKET-274",
|
| 706 |
+
"topic_tags": [
|
| 707 |
+
"How-to"
|
| 708 |
+
],
|
| 709 |
+
"topic_scores": {
|
| 710 |
+
"How-to": 0.5874260067939758,
|
| 711 |
+
"Best practices": 0.10692328214645386,
|
| 712 |
+
"Product": 0.09862891584634781,
|
| 713 |
+
"Sensitive data": 0.0611591637134552,
|
| 714 |
+
"Glossary": 0.038298100233078,
|
| 715 |
+
"SSO": 0.037935610860586166,
|
| 716 |
+
"Connector": 0.029464809224009514,
|
| 717 |
+
"Lineage": 0.023105787113308907,
|
| 718 |
+
"API/SDK": 0.017058314755558968
|
| 719 |
+
},
|
| 720 |
+
"sentiment": "Angry",
|
| 721 |
+
"priority": "P2"
|
| 722 |
+
}
|
| 723 |
+
}
|
| 724 |
+
]
|
docs_corpus.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
docs_meta.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
faiss_index.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb0b698008aae120d30504c1784f0797dc800066b27d63740c079b2466549d8c
|
| 3 |
+
size 18026541
|
sample_tickets.json
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"id": "TICKET-245",
|
| 4 |
+
"subject": "Connecting Snowflake to Atlan - required permissions?",
|
| 5 |
+
"body": "Hi team, we're trying to set up our primary Snowflake production database as a new source in Atlan, but the connection keeps failing. We've tried using our standard service account, but it's not working. Our entire BI team is blocked on this integration for a major upcoming project, so it's quite urgent. Could you please provide a definitive list of the exact permissions and credentials needed on the Snowflake side to get this working? Thanks."
|
| 6 |
+
},
|
| 7 |
+
{
|
| 8 |
+
"id": "TICKET-246",
|
| 9 |
+
"subject": "Which connectors automatically capture lineage?",
|
| 10 |
+
"body": "Hello, I'm new to Atlan and trying to understand the lineage capabilities. The documentation mentions automatic lineage, but it's not clear which of our connectors (we use Fivetran, dbt, and Tableau) support this out-of-the-box. We need to present a clear picture of our data flow to leadership next week. Can you explain how lineage capture differs for these tools?"
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"id": "TICKET-247",
|
| 14 |
+
"subject": "Deployment of Atlan agent for private data lake",
|
| 15 |
+
"body": "Our primary data lake is hosted on-premise within a secure VPC and is not exposed to the internet. We understand we need to use the Atlan agent for this, but the setup instructions are a bit confusing for our security team. This is a critical source for us, and we can't proceed with our rollout until we get this connected. Can you provide a detailed deployment guide or connect us with a technical expert?"
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"id": "TICKET-248",
|
| 19 |
+
"subject": "How to surface sample rows and schema changes?",
|
| 20 |
+
"body": "Hi, we've successfully connected our Redshift cluster, and the assets are showing up. However, my data analysts are asking how they can see sample data or recent schema changes directly within Atlan without having to go back to Redshift. Is this feature available? I feel like I'm missing something obvious."
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"id": "TICKET-249",
|
| 24 |
+
"subject": "Exporting lineage view for a specific table",
|
| 25 |
+
"body": "For our quarterly audit, I need to provide a complete upstream and downstream lineage diagram for our core `fact_orders` table. I can see the lineage perfectly in the UI, but I can't find an option to export this view as an image or PDF. This is a hard requirement from our compliance team and the deadline is approaching fast. Please help!"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"id": "TICKET-250",
|
| 29 |
+
"subject": "Importing lineage from Airflow jobs",
|
| 30 |
+
"body": "We run hundreds of ETL jobs in Airflow, and we need to see that lineage reflected in Atlan. I've read that Atlan can integrate with Airflow, but how do we configure it to correctly map our DAGs to the specific datasets they are transforming? The current documentation is a bit high-level."
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"id": "TICKET-251",
|
| 34 |
+
"subject": "Using the Visual Query Builder",
|
| 35 |
+
"body": "I'm a business analyst and not very comfortable with writing complex SQL. I was excited to see the Visual Query Builder in Atlan, but I'm having trouble figuring out how to join multiple tables and save my query for later use. Is there a tutorial or a quick guide you can point me to?"
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"id": "TICKET-252",
|
| 39 |
+
"subject": "Programmatic extraction of lineage",
|
| 40 |
+
"body": "Our internal data science team wants to build a custom application that analyzes metadata propagation delays. To do this, we need to programmatically extract lineage data from Atlan via an API. Does the API expose lineage information, and if so, could you provide an example of the endpoint and the structure of the response?"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"id": "TICKET-253",
|
| 44 |
+
"subject": "Upstream lineage to Snowflake view not working",
|
| 45 |
+
"body": "This is infuriating. We have a critical Snowflake view, `finance.daily_revenue`, that is built from three upstream tables. Atlan is correctly showing the downstream dependencies, but the upstream lineage is completely missing. This makes the view untrustworthy for our analysts. We've re-run the crawler multiple times. What could be causing this? This is a huge problem for us."
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"id": "TICKET-254",
|
| 49 |
+
"subject": "How to create a business glossary and link terms in bulk?",
|
| 50 |
+
"body": "We are migrating our existing business glossary from a spreadsheet into Atlan. We have over 500 terms. Manually creating each one and linking them to thousands of assets seems impossible. Is there a bulk import feature using CSV or an API to create terms and link them to assets? This is blocking our entire governance initiative."
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"id": "TICKET-255",
|
| 54 |
+
"subject": "Creating a custom role for data stewards",
|
| 55 |
+
"body": "I'm trying to set up a custom role for our data stewards. They need permission to edit descriptions and link glossary terms, but they should NOT have permission to run queries or change connection settings. I'm looking at the default roles, but none of them fit perfectly. How can I create a new role with this specific set of permissions?"
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"id": "TICKET-256",
|
| 59 |
+
"subject": "Mapping Active Directory groups to Atlan teams",
|
| 60 |
+
"body": "Our company policy requires us to manage all user access through Active Directory groups. We need to map our existing AD groups (e.g., 'data-analyst-finance', 'data-engineer-core') to teams within Atlan to automatically grant the correct permissions. I can't find the settings for this. How is this configured?"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"id": "TICKET-257",
|
| 64 |
+
"subject": "RBAC for assets vs. glossaries",
|
| 65 |
+
"body": "I need clarification on how Atlan's role-based access control works. If a user is denied access to a specific Snowflake schema, can they still see the glossary terms that are linked to the tables in that schema? I need to ensure our PII governance is airtight."
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"id": "TICKET-258",
|
| 69 |
+
"subject": "Process for onboarding asset owners",
|
| 70 |
+
"body": "We've started identifying owners for our key data assets. What is the recommended workflow in Atlan to assign these owners and automatically notify them? We want to make sure they are aware of their responsibilities without us having to send manual emails for every assignment."
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"id": "TICKET-259",
|
| 74 |
+
"subject": "How does Atlan surface sensitive fields like PII?",
|
| 75 |
+
"body": "Our security team is evaluating Atlan and their main question is around PII and sensitive data. How does Atlan automatically identify fields containing PII? What are our options to apply tags or masks to these fields once they are identified to prevent unauthorized access?"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"id": "TICKET-260",
|
| 79 |
+
"subject": "Authentication methods for APIs and SDKs",
|
| 80 |
+
"body": "We are planning to build several automations using the Atlan API and Python SDK. What authentication methods are supported? Is it just API keys, or can we use something like OAuth? We have a strict policy that requires key rotation every 90 days, so we need to understand how to manage this programmatically."
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"id": "TICKET-261",
|
| 84 |
+
"subject": "Enabling and testing SAML SSO",
|
| 85 |
+
"body": "We are ready to enable SAML SSO with our Okta instance. However, we are very concerned about disrupting our active users if the configuration is wrong. Is there a way to test the SSO configuration for a specific user or group before we enable it for the entire workspace?"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"id": "TICKET-262",
|
| 89 |
+
"subject": "SSO login not assigning user to correct group",
|
| 90 |
+
"body": "I've just had a new user, 'test.user@company.com', log in via our newly configured SSO. They were authenticated successfully, but they were not added to the 'Data Analysts' group as expected based on our SAML assertions. This is preventing them from accessing any assets. What could be the reason for this mis-assignment?"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"id": "TICKET-263",
|
| 94 |
+
"subject": "Integration with existing DLP or secrets manager",
|
| 95 |
+
"body": "Does Atlan have the capability to integrate with third-party tools like a DLP (Data Loss Prevention) solution or a secrets manager like HashiCorp Vault? We need to ensure that connection credentials and sensitive metadata classifications are handled by our central security systems."
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"id": "TICKET-264",
|
| 99 |
+
"subject": "Accessing audit logs for compliance reviews",
|
| 100 |
+
"body": "Our compliance team needs to perform a quarterly review of all activities within Atlan. They need to know who accessed what data, who made permission changes, etc. Where can we find these audit logs, and is there a way to export them or pull them via an API for our records?"
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"id": "TICKET-265",
|
| 104 |
+
"subject": "How to programmatically create an asset using the REST API?",
|
| 105 |
+
"body": "I'm trying to create a new custom asset (a 'Report') using the REST API, but my requests keep failing with a 400 error. The API documentation is a bit sparse on the required payload structure for creating new entities. Could you provide a basic cURL or Python `requests` example of what a successful request body should look like?"
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"id": "TICKET-266",
|
| 109 |
+
"subject": "SDK availability and Python example",
|
| 110 |
+
"body": "I'm a data engineer and prefer using SDKs over raw API calls. Which languages do you provide SDKs for? I'm particularly interested in Python. Where can I find the installation instructions (e.g., PyPI package name) and a short code snippet for a common task, like creating a new glossary term?"
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"id": "TICKET-267",
|
| 114 |
+
"subject": "How do webhooks work in Atlan?",
|
| 115 |
+
"body": "I'm exploring using webhooks to send real-time notifications from Atlan to our internal Slack channel. I need to understand what types of events (e.g., asset updated, term created) can trigger a webhook. Also, how do we validate that the incoming payloads are genuinely from Atlan? Do you support payload signing?"
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"id": "TICKET-268",
|
| 119 |
+
"subject": "Triggering an AWS Lambda from Atlan events",
|
| 120 |
+
"body": "We have a workflow where we want to trigger a custom AWS Lambda function whenever a specific Atlan tag (e.g., 'PII-Confirmed') is added to an asset. What is the recommended and most secure way to set this up? Should we use webhooks pointing to an API Gateway, or is there a more direct integration?"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"id": "TICKET-269",
|
| 124 |
+
"subject": "When to use Atlan automations vs. external services?",
|
| 125 |
+
"body": "I see that Atlan has a built-in 'Automations' feature. I'm trying to decide if I should use this to manage a workflow or if I should use an external service like Zapier or our own Airflow instance. Could you provide some guidance or examples on what types of workflows are best suited for the native automations versus an external tool?"
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"id": "TICKET-270",
|
| 129 |
+
"subject": "Connector failed to crawl - where to check logs?",
|
| 130 |
+
"body": "URGENT: Our nightly Snowflake crawler failed last night and no new metadata was ingested. This is a critical failure as our morning reports are now missing lineage information. Where can I find the detailed error logs for the crawler run to understand what went wrong? I need to fix this ASAP."
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"id": "TICKET-271",
|
| 134 |
+
"subject": "Asset extracted but not published to Atlan",
|
| 135 |
+
"body": "This is very strange. I'm looking at the crawler logs, and I can see that the asset 'schema.my_table' was successfully extracted from the source. However, when I search for this table in the Atlan UI, it doesn't appear. It seems like it's getting stuck somewhere between extraction and publishing. Can you please investigate the root cause?"
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"id": "TICKET-272",
|
| 139 |
+
"subject": "How to measure adoption and generate reports?",
|
| 140 |
+
"body": "My manager is asking for metrics on our Atlan usage to justify the investment. I need to generate a report showing things like the number of active users, most frequently queried tables, and the number of assets with assigned owners. Does Atlan have a reporting or dashboarding feature for this?"
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"id": "TICKET-273",
|
| 144 |
+
"subject": "Best practices for catalog hygiene",
|
| 145 |
+
"body": "We've been using Atlan for six months, and our catalog is already starting to get a bit messy with duplicate assets and stale metadata from old tests. As we roll this out to more teams, what are some common best practices or features within Atlan that can help us maintain good catalog hygiene and prevent this problem from getting worse?"
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"id": "TICKET-274",
|
| 149 |
+
"subject": "How to scale Atlan across multiple business units?",
|
| 150 |
+
"body": "We are planning a global rollout of Atlan to multiple business units, each with its own data sources and governance teams. We're looking for advice on the best way to structure our Atlan instance. Should we use separate workspaces, or can we achieve isolation using teams and permissions within a single workspace while maintaining a consistent governance model?"
|
| 151 |
+
}
|
| 152 |
+
]
|
src/__pycache__/classifier.cpython-313.pyc
ADDED
|
Binary file (6.34 kB). View file
|
|
|
src/__pycache__/data_loader.cpython-313.pyc
ADDED
|
Binary file (1.15 kB). View file
|
|
|
src/__pycache__/rag.cpython-313.pyc
ADDED
|
Binary file (20.1 kB). View file
|
|
|
src/app.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/app.py
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from data_loader import load_tickets
|
| 7 |
+
from classifier import classify_ticket, classify_all_and_save
|
| 8 |
+
|
| 9 |
+
# NEW: import RAG handler
|
| 10 |
+
try:
|
| 11 |
+
from rag import handle_rag_query
|
| 12 |
+
except Exception:
|
| 13 |
+
handle_rag_query = None # we'll handle absence gracefully
|
| 14 |
+
|
| 15 |
+
# Config
|
| 16 |
+
st.set_page_config(page_title="Atlan - Support Copilot (Phase 3)", layout="wide")
|
| 17 |
+
ROOT = Path(__file__).parent.parent.resolve() # project root
|
| 18 |
+
CLASSIFIED_PATH = ROOT.joinpath("classified_tickets_phase2.json")
|
| 19 |
+
|
| 20 |
+
st.title("Atlan — Support Copilot (Phase 3)")
|
| 21 |
+
st.markdown(
|
| 22 |
+
"**Phase 3:** Zero-shot topic classification + HF sentiment + rule-based priority + RAG (retrieval-augmented generation). "
|
| 23 |
+
"This demo shows bulk classification and an interactive agent with RAG."
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Sidebar controls
|
| 27 |
+
st.sidebar.header("Controls")
|
| 28 |
+
use_saved = st.sidebar.checkbox("Load pre-saved classified file (if available)", value=True)
|
| 29 |
+
run_classify_all = st.sidebar.button("Classify ALL tickets & Save (Phase 2)")
|
| 30 |
+
reload_ui = st.sidebar.button("Reload UI")
|
| 31 |
+
|
| 32 |
+
# NEW: RAG options in sidebar
|
| 33 |
+
st.sidebar.markdown("### RAG options")
|
| 34 |
+
use_openai = st.sidebar.checkbox("Use OpenAI for generation (if API key set)", value=False)
|
| 35 |
+
top_k = st.sidebar.slider("RAG: number of passages to retrieve", min_value=1, max_value=10, value=5)
|
| 36 |
+
|
| 37 |
+
# Safe reload: attempt API call, otherwise show instruction to refresh
|
| 38 |
+
if reload_ui:
|
| 39 |
+
try:
|
| 40 |
+
st.experimental_rerun()
|
| 41 |
+
except Exception:
|
| 42 |
+
st.info("Automatic reload isn't supported by this Streamlit version. Please refresh the browser page to reload the UI.")
|
| 43 |
+
|
| 44 |
+
# Load tickets (original) — call the loader without forcing a path so it uses its default
|
| 45 |
+
try:
|
| 46 |
+
tickets = load_tickets() # loader default = ../sample_tickets.json (project root)
|
| 47 |
+
except Exception as e:
|
| 48 |
+
st.error("Could not load sample tickets. Ensure sample_tickets.json exists at the project root (one level above src/).")
|
| 49 |
+
st.exception(e)
|
| 50 |
+
tickets = []
|
| 51 |
+
|
| 52 |
+
# If user asked to classify all, run classification and save (uses classifier defaults)
|
| 53 |
+
if run_classify_all:
|
| 54 |
+
with st.spinner("Running classification on all tickets (models may load on first run)..."):
|
| 55 |
+
try:
|
| 56 |
+
out_path = classify_all_and_save() # defaults -> saves to ../classified_tickets_phase2.json
|
| 57 |
+
st.success(f"Classified and saved to: {out_path}")
|
| 58 |
+
except Exception as e:
|
| 59 |
+
st.error("Error during batch classification. See details below.")
|
| 60 |
+
st.exception(e)
|
| 61 |
+
|
| 62 |
+
# Try to load pre-saved classified file (if requested and exists)
|
| 63 |
+
classified_data = None
|
| 64 |
+
if use_saved and CLASSIFIED_PATH.exists():
|
| 65 |
+
try:
|
| 66 |
+
classified_data = json.loads(CLASSIFIED_PATH.read_text(encoding="utf-8"))
|
| 67 |
+
except Exception as e:
|
| 68 |
+
st.warning("Could not read the saved classified file; falling back to live classification.")
|
| 69 |
+
st.exception(e)
|
| 70 |
+
|
| 71 |
+
tab1, tab2 = st.tabs(["Bulk Classification Dashboard", "Interactive Agent (demo + RAG)"])
|
| 72 |
+
|
| 73 |
+
with tab1:
|
| 74 |
+
st.header("Bulk ticket classification")
|
| 75 |
+
st.write("This view shows all tickets with their inferred topic tags, sentiment, and priority.")
|
| 76 |
+
|
| 77 |
+
rows = []
|
| 78 |
+
# If we have pre-classified data, use that (faster). Otherwise classify on the fly.
|
| 79 |
+
if classified_data:
|
| 80 |
+
for entry in classified_data:
|
| 81 |
+
c = entry.get("classification", {})
|
| 82 |
+
rows.append({
|
| 83 |
+
"id": entry.get("id"),
|
| 84 |
+
"subject": entry.get("subject"),
|
| 85 |
+
"topic_tags": ", ".join(c.get("topic_tags", [])),
|
| 86 |
+
"sentiment": c.get("sentiment", ""),
|
| 87 |
+
"priority": c.get("priority", ""),
|
| 88 |
+
})
|
| 89 |
+
else:
|
| 90 |
+
# Live classify (will call HF pipelines lazily)
|
| 91 |
+
with st.spinner("Classifying tickets (zero-shot)... this may take a few seconds on first run"):
|
| 92 |
+
for t in tickets:
|
| 93 |
+
try:
|
| 94 |
+
c = classify_ticket(t)
|
| 95 |
+
except Exception as e:
|
| 96 |
+
st.error(f"Error classifying ticket {t.get('id')}: {e}")
|
| 97 |
+
c = {"topic_tags": [], "sentiment": "Error", "priority": "Error"}
|
| 98 |
+
rows.append({
|
| 99 |
+
"id": t.get("id"),
|
| 100 |
+
"subject": t.get("subject"),
|
| 101 |
+
"topic_tags": ", ".join(c.get("topic_tags", [])),
|
| 102 |
+
"sentiment": c.get("sentiment", ""),
|
| 103 |
+
"priority": c.get("priority", ""),
|
| 104 |
+
})
|
| 105 |
+
|
| 106 |
+
df = pd.DataFrame(rows)
|
| 107 |
+
# basic filters
|
| 108 |
+
cols = st.columns([2, 1, 1, 1])
|
| 109 |
+
with cols[0]:
|
| 110 |
+
q = st.text_input("Filter by subject/text contains")
|
| 111 |
+
with cols[1]:
|
| 112 |
+
sel_topic = st.selectbox("Filter by topic (contains)", options=["(any)"] + sorted({t for row in rows for t in row["topic_tags"].split(", ") if t}))
|
| 113 |
+
with cols[2]:
|
| 114 |
+
sel_sent = st.selectbox("Filter by sentiment", options=["(any)","Angry","Frustrated","Neutral","Curious","Positive"])
|
| 115 |
+
with cols[3]:
|
| 116 |
+
sel_prio = st.selectbox("Filter by priority", options=["(any)","P0","P1","P2"])
|
| 117 |
+
|
| 118 |
+
df_display = df.copy()
|
| 119 |
+
if q:
|
| 120 |
+
df_display = df_display[df_display["subject"].str.contains(q, case=False, na=False) | df_display["topic_tags"].str.contains(q, case=False, na=False)]
|
| 121 |
+
if sel_topic and sel_topic != "(any)":
|
| 122 |
+
df_display = df_display[df_display["topic_tags"].str.contains(sel_topic, na=False)]
|
| 123 |
+
if sel_sent and sel_sent != "(any)":
|
| 124 |
+
df_display = df_display[df_display["sentiment"] == sel_sent]
|
| 125 |
+
if sel_prio and sel_prio != "(any)":
|
| 126 |
+
df_display = df_display[df_display["priority"] == sel_prio]
|
| 127 |
+
|
| 128 |
+
st.dataframe(df_display.reset_index(drop=True), use_container_width=True, height=420)
|
| 129 |
+
|
| 130 |
+
st.markdown("### Sample ticket detail")
|
| 131 |
+
# choose ticket
|
| 132 |
+
ids = df_display["id"].tolist()
|
| 133 |
+
if ids:
|
| 134 |
+
sel = st.selectbox("Select ticket", ids)
|
| 135 |
+
# find original ticket object (from classified_data if present else from tickets)
|
| 136 |
+
selected_full = None
|
| 137 |
+
if classified_data:
|
| 138 |
+
selected_full = next((x for x in classified_data if x["id"] == sel), None)
|
| 139 |
+
if not selected_full:
|
| 140 |
+
selected_full = next((x for x in tickets if x["id"] == sel), None)
|
| 141 |
+
|
| 142 |
+
st.write(selected_full)
|
| 143 |
+
st.markdown("**Classification (raw)**")
|
| 144 |
+
if selected_full and "classification" in selected_full:
|
| 145 |
+
st.json(selected_full["classification"])
|
| 146 |
+
else:
|
| 147 |
+
# classify on-the-fly for selected ticket if no classification exists
|
| 148 |
+
with st.spinner("Classifying selected ticket..."):
|
| 149 |
+
try:
|
| 150 |
+
c = classify_ticket(selected_full)
|
| 151 |
+
except Exception as e:
|
| 152 |
+
st.error("Error during classification of selected ticket.")
|
| 153 |
+
st.exception(e)
|
| 154 |
+
c = {}
|
| 155 |
+
st.json(c)
|
| 156 |
+
else:
|
| 157 |
+
st.info("No tickets to display with current filters.")
|
| 158 |
+
|
| 159 |
+
with tab2:
|
| 160 |
+
st.header("Interactive Agent (Phase 3 - analysis + RAG)")
|
| 161 |
+
st.markdown(
|
| 162 |
+
"Paste a ticket subject and body (or type). The backend analysis will show topic tags, sentiment and priority. "
|
| 163 |
+
"If the topic is one of the RAG-enabled categories (How-to, Product, Best practices, API/SDK, SSO), the app will run RAG and show a cited answer."
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
user_input = st.text_area("Paste a ticket subject + body (or type a new one)", height=220, placeholder="Subject line on first line, body below...")
|
| 167 |
+
analyze = st.button("Analyze input")
|
| 168 |
+
|
| 169 |
+
if analyze:
|
| 170 |
+
if not user_input.strip():
|
| 171 |
+
st.warning("Enter some ticket text to analyze.")
|
| 172 |
+
else:
|
| 173 |
+
# Infer subject/body: first line = subject
|
| 174 |
+
lines = user_input.strip().split("\n")
|
| 175 |
+
subject = lines[0]
|
| 176 |
+
body = "\n".join(lines[1:]).strip() if len(lines) > 1 else user_input.strip()
|
| 177 |
+
demo_ticket = {"id": "TEMP", "subject": subject, "body": body}
|
| 178 |
+
with st.spinner("Analyzing (zero-shot + sentiment)..."):
|
| 179 |
+
try:
|
| 180 |
+
c = classify_ticket(demo_ticket)
|
| 181 |
+
except Exception as e:
|
| 182 |
+
st.error("Error during classification.")
|
| 183 |
+
st.exception(e)
|
| 184 |
+
c = {"topic_tags": [], "sentiment": "Error", "priority": "Error"}
|
| 185 |
+
|
| 186 |
+
st.subheader("Internal analysis (backend view)")
|
| 187 |
+
st.json(c)
|
| 188 |
+
|
| 189 |
+
st.subheader("Final response (frontend view)")
|
| 190 |
+
# RAG-enabled topics
|
| 191 |
+
allowed_rag = {"How-to", "Product", "Best practices", "API/SDK", "SSO"}
|
| 192 |
+
|
| 193 |
+
# If ticket topic is RAG-enabled -> run RAG
|
| 194 |
+
if any(lbl in allowed_rag for lbl in c.get("topic_tags", [])):
|
| 195 |
+
if handle_rag_query is None:
|
| 196 |
+
st.error("RAG handler not found. Make sure src/rag.py exists and is importable.")
|
| 197 |
+
else:
|
| 198 |
+
st.info("RAG triggered — retrieving docs and generating an answer...")
|
| 199 |
+
with st.spinner("Retrieving + generating answer (may take a few seconds)..."):
|
| 200 |
+
# Use the combined subject+body as the query
|
| 201 |
+
query_text = f"{subject}\n\n{body}"
|
| 202 |
+
try:
|
| 203 |
+
rag_res = handle_rag_query(query_text, top_k=top_k, use_openai=use_openai)
|
| 204 |
+
except Exception as e:
|
| 205 |
+
st.error("Error during RAG operation.")
|
| 206 |
+
st.exception(e)
|
| 207 |
+
rag_res = {"answer": "RAG failed.", "sources": [], "retrieved": []}
|
| 208 |
+
|
| 209 |
+
st.subheader("Answer")
|
| 210 |
+
st.markdown(rag_res.get("answer", "No answer returned."))
|
| 211 |
+
|
| 212 |
+
st.subheader("Sources (citations)")
|
| 213 |
+
for s in rag_res.get("sources", []):
|
| 214 |
+
st.write(s)
|
| 215 |
+
|
| 216 |
+
st.subheader("Top retrieved passages (debug view)")
|
| 217 |
+
for r in rag_res.get("retrieved", [])[:top_k]:
|
| 218 |
+
st.markdown(f"**Title:** {r.get('title','(no title)')} \n**URL:** {r.get('url')} \n**Score:** {r.get('score'):.4f}")
|
| 219 |
+
st.write(r.get("text","")[:800] + ("..." if len(r.get("text","")) > 800 else ""))
|
| 220 |
+
|
| 221 |
+
else:
|
| 222 |
+
st.success(f"This ticket has been classified as {c.get('topic_tags', [])} and routed to the appropriate team.")
|
| 223 |
+
|
| 224 |
+
st.markdown("---")
|
| 225 |
+
st.caption(
|
| 226 |
+
"Phase 3 demo — zero-shot topic classification (facebook/bart-large-mnli), sentiment (distilbert SST-2), and RAG using local FAISS + sentence-transformers. "
|
| 227 |
+
"Toggle 'Use OpenAI' in the sidebar to use the OpenAI API for generation (requires OPENAI_API_KEY in env)."
|
| 228 |
+
)
|
src/classifier.py
ADDED
|
@@ -0,0 +1,150 @@
|
| 1 |
+
# src/classifier.py
|
| 2 |
+
from typing import Dict, List, Union
|
| 3 |
+
from transformers import pipeline
|
| 4 |
+
import math
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
# Lazy-loaded pipelines (module-level to reuse)
|
| 9 |
+
_zero_shot_clf = None
|
| 10 |
+
_sentiment_clf = None
|
| 11 |
+
|
| 12 |
+
def get_zero_shot_classifier():
|
| 13 |
+
global _zero_shot_clf
|
| 14 |
+
if _zero_shot_clf is None:
|
| 15 |
+
# BART or RoBERTa NLI models are common choices
|
| 16 |
+
_zero_shot_clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
|
| 17 |
+
return _zero_shot_clf
|
| 18 |
+
|
| 19 |
+
def get_sentiment_classifier():
|
| 20 |
+
global _sentiment_clf
|
| 21 |
+
if _sentiment_clf is None:
|
| 22 |
+
# SST-2 fine-tuned model
|
| 23 |
+
_sentiment_clf = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
|
| 24 |
+
return _sentiment_clf
|
| 25 |
+
|
| 26 |
+
# Schema - fixed topic labels requested by the assignment
|
| 27 |
+
TOPIC_LABELS = [
|
| 28 |
+
"How-to",
|
| 29 |
+
"Product",
|
| 30 |
+
"Connector",
|
| 31 |
+
"Lineage",
|
| 32 |
+
"API/SDK",
|
| 33 |
+
"SSO",
|
| 34 |
+
"Glossary",
|
| 35 |
+
"Best practices",
|
| 36 |
+
"Sensitive data"
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
# Optionally add synonyms/prompts to nudge zero-shot
|
| 40 |
+
LABEL_DESCRIPTIONS = {
|
| 41 |
+
"How-to": "user asking how to perform a task or request a tutorial",
|
| 42 |
+
"Product": "product feature, UI or general product question",
|
| 43 |
+
"Connector": "questions about connectors, crawlers, integrations and failures",
|
| 44 |
+
"Lineage": "questions about lineage, upstream/downstream or lineage exports",
|
| 45 |
+
"API/SDK": "developer questions about APIs, SDKs, endpoints, code examples",
|
| 46 |
+
"SSO": "authentication, SAML, SSO, Okta, login issues",
|
| 47 |
+
"Glossary": "business glossary, terms, bulk import of glossary terms",
|
| 48 |
+
"Best practices": "request for recommended approach, best practices or governance",
|
| 49 |
+
"Sensitive data": "questions about PII, masking, DLP, secrets"
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
def classify_topic_zero_shot(text: str, labels: List[str] = TOPIC_LABELS, hypothesis_template: str = "This text is about {}.") -> Dict:
|
| 53 |
+
"""
|
| 54 |
+
Returns a dictionary with labels and scores from zero-shot classifier.
|
| 55 |
+
"""
|
| 56 |
+
clf = get_zero_shot_classifier()
|
| 57 |
+
# The HF zero-shot pipeline can accept a 'hypothesis_template' to improve results.
|
| 58 |
+
res = clf(sequences=text, candidate_labels=labels, hypothesis_template=hypothesis_template)
|
| 59 |
+
# Example res: {'sequence':..., 'labels': [...], 'scores':[...]}
|
| 60 |
+
# We'll return top N labels above a threshold
|
| 61 |
+
return res
|
| 62 |
+
|
| 63 |
+
def classify_sentiment_hf(text: str) -> str:
|
| 64 |
+
"""
|
| 65 |
+
Returns a human-friendly sentiment label, mapping HF outputs to your schema.
|
| 66 |
+
HF model returns POSITIVE/NEGATIVE with a score.
|
| 67 |
+
We'll use a small mapping to Frustrated/Curious/Angry/Neutral/Positive.
|
| 68 |
+
"""
|
| 69 |
+
clf = get_sentiment_classifier()
|
| 70 |
+
out = clf(text[:1000]) # truncate long text for speed
|
| 71 |
+
# out like [{'label': 'NEGATIVE', 'score': 0.999}]
|
| 72 |
+
if not out:
|
| 73 |
+
return "Neutral"
|
| 74 |
+
lab = out[0]["label"].upper()
|
| 75 |
+
score = out[0]["score"]
|
| 76 |
+
# simple mapping
|
| 77 |
+
if lab == "NEGATIVE":
|
| 78 |
+
# distinguish angry vs frustrated by strength
|
| 79 |
+
if score > 0.9:
|
| 80 |
+
return "Angry"
|
| 81 |
+
return "Frustrated"
|
| 82 |
+
elif lab == "POSITIVE":
|
| 83 |
+
if score > 0.9:
|
| 84 |
+
return "Positive"
|
| 85 |
+
return "Curious"
|
| 86 |
+
else:
|
| 87 |
+
return "Neutral"
|
| 88 |
+
|
| 89 |
+
# Keep same rule-based priority function (deterministic SLA logic)
|
| 90 |
+
PRIORITY_KEYWORDS_P0 = ["urgent", "asap", "blocked", "blocker", "critical", "production", "failed", "failure", "infuriating", "can't", "cant", "down", "urgent:"]
|
| 91 |
+
PRIORITY_KEYWORDS_P1 = ["need", "important", "deadline", "next week", "approaching", "required", "soon", "high"]
|
| 92 |
+
|
| 93 |
+
def classify_priority(text: str, subject: str = "") -> str:
|
| 94 |
+
t = (subject + " " + text).lower()
|
| 95 |
+
for k in PRIORITY_KEYWORDS_P0:
|
| 96 |
+
if k in t:
|
| 97 |
+
return "P0"
|
| 98 |
+
for k in PRIORITY_KEYWORDS_P1:
|
| 99 |
+
if k in t:
|
| 100 |
+
return "P1"
|
| 101 |
+
return "P2"
|
| 102 |
+
|
| 103 |
+
def classify_ticket(ticket: Dict, top_k: int = 2, label_score_threshold: float = 0.25) -> Dict:
|
| 104 |
+
"""
|
| 105 |
+
Full classification of a single ticket:
|
| 106 |
+
- topic_tags: top_k labels from zero-shot (above threshold)
|
| 107 |
+
- sentiment: HF sentiment mapped
|
| 108 |
+
- priority: rule-based
|
| 109 |
+
"""
|
| 110 |
+
text = " ".join([ticket.get("subject", ""), ticket.get("body", "")])
|
| 111 |
+
z = classify_topic_zero_shot(text)
|
| 112 |
+
labels = z.get("labels", [])
|
| 113 |
+
scores = z.get("scores", [])
|
| 114 |
+
# Collect top_k labels above threshold
|
| 115 |
+
topic_tags = []
|
| 116 |
+
for lbl, score in zip(labels, scores):
|
| 117 |
+
if score >= label_score_threshold:
|
| 118 |
+
topic_tags.append(lbl)
|
| 119 |
+
if len(topic_tags) >= top_k:
|
| 120 |
+
break
|
| 121 |
+
# fallback: if nothing passes threshold, take the top label
|
| 122 |
+
if not topic_tags and labels:
|
| 123 |
+
topic_tags = [labels[0]]
|
| 124 |
+
|
| 125 |
+
sentiment = classify_sentiment_hf(text)
|
| 126 |
+
priority = classify_priority(ticket.get("body",""), ticket.get("subject",""))
|
| 127 |
+
|
| 128 |
+
return {
|
| 129 |
+
"id": ticket.get("id"),
|
| 130 |
+
"topic_tags": topic_tags,
|
| 131 |
+
"topic_scores": {lbl: float(s) for lbl, s in zip(labels, scores)},
|
| 132 |
+
"sentiment": sentiment,
|
| 133 |
+
"priority": priority
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
# batch classify and save JSON
|
| 137 |
+
def classify_all_and_save(input_path: Union[str, Path] = "../sample_tickets.json", output_path: Union[str, Path] = "../classified_tickets_phase2.json"):
|
| 138 |
+
p_in = Path(__file__).parent.joinpath(input_path).resolve()
|
| 139 |
+
p_out = Path(__file__).parent.joinpath(output_path).resolve()
|
| 140 |
+
tickets = json.loads(p_in.read_text(encoding="utf-8"))
|
| 141 |
+
results = []
|
| 142 |
+
for t in tickets:
|
| 143 |
+
c = classify_ticket(t)
|
| 144 |
+
results.append({**t, "classification": c})
|
| 145 |
+
p_out.write_text(json.dumps(results, indent=2), encoding="utf-8")
|
| 146 |
+
print(f"Saved {len(results)} classified tickets to {p_out}")
|
| 147 |
+
return p_out
|
| 148 |
+
|
| 149 |
+
if __name__ == "__main__":
|
| 150 |
+
classify_all_and_save()
|
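A minimal usage sketch for the classifier above (assumptions: run from the project root so that src/ can be appended to sys.path; the ticket below is made up for illustration and is not one of the shipped samples):

import sys
sys.path.append("src")  # assumption: executed from the project root, where src/ lives
from classifier import classify_ticket

sample = {
    "id": "TICKET-999",  # hypothetical ticket for illustration only
    "subject": "Okta SSO login failing",
    "body": "Users cannot log in via Okta since this morning. This is urgent and blocking production.",
}
result = classify_ticket(sample)
# classify_ticket returns topic_tags, topic_scores, sentiment and priority for the ticket
print(result["topic_tags"], result["sentiment"], result["priority"])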
src/data_loader.py
ADDED
|
@@ -0,0 +1,16 @@
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import List, Dict
|
| 4 |
+
|
| 5 |
+
def load_tickets(path: str = "../sample_tickets.json") -> List[Dict]:
|
| 6 |
+
p = Path(__file__).parent.joinpath(path).resolve()
|
| 7 |
+
with open(p, "r", encoding="utf-8") as f:
|
| 8 |
+
tickets = json.load(f)
|
| 9 |
+
return tickets
|
| 10 |
+
|
| 11 |
+
if __name__ == "__main__":
|
| 12 |
+
tickets = load_tickets()
|
| 13 |
+
print(f"Loaded {len(tickets)} tickets")
|
| 14 |
+
# show first ticket
|
| 15 |
+
import pprint
|
| 16 |
+
pprint.pprint(tickets[0])
|
src/indexer.py
ADDED
|
@@ -0,0 +1,94 @@
|
| 1 |
+
# src/indexer.py
|
| 2 |
+
"""
|
| 3 |
+
Index the cleaned corpus into FAISS using sentence-level/small-chunk passages.
|
| 4 |
+
|
| 5 |
+
Outputs:
|
| 6 |
+
- faiss_index.bin (FAISS index)
|
| 7 |
+
- docs_meta.jsonl (one JSON line per vector with fields: id, url, title, text)
|
| 8 |
+
"""
|
| 9 |
+
from sentence_transformers import SentenceTransformer
|
| 10 |
+
import faiss
|
| 11 |
+
import ujson as json
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
import numpy as np
|
| 15 |
+
import re
|
| 16 |
+
|
| 17 |
+
CORPUS_PATH = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
|
| 18 |
+
META_PATH = Path(__file__).parent.parent.joinpath("docs_meta.jsonl")
|
| 19 |
+
INDEX_PATH = Path(__file__).parent.parent.joinpath("faiss_index.bin")
|
| 20 |
+
EMBED_MODEL = "all-MiniLM-L6-v2"
|
| 21 |
+
|
| 22 |
+
# chunking by sentences, group up to N sentences per chunk (1-3 recommended)
|
| 23 |
+
MAX_SENTENCES_PER_CHUNK = 2
|
| 24 |
+
|
| 25 |
+
SENT_SPLIT_RE = re.compile(r'([.!?])\s+')
|
| 26 |
+
|
| 27 |
+
def split_into_sentences(text: str):
|
| 28 |
+
if not text:
|
| 29 |
+
return []
|
| 30 |
+
parts = SENT_SPLIT_RE.split(text)
|
| 31 |
+
sents = []
|
| 32 |
+
for i in range(0, len(parts), 2):
|
| 33 |
+
chunk = parts[i].strip()
|
| 34 |
+
punct = parts[i+1] if (i+1)<len(parts) else ""
|
| 35 |
+
sent = (chunk + punct).strip()
|
| 36 |
+
if sent:
|
| 37 |
+
sents.append(sent)
|
| 38 |
+
return sents
|
| 39 |
+
|
| 40 |
+
def build_index():
|
| 41 |
+
if not CORPUS_PATH.exists():
|
| 42 |
+
raise FileNotFoundError(f"Corpus not found at {CORPUS_PATH}. Run src/scrape_docs.py first.")
|
| 43 |
+
|
| 44 |
+
model = SentenceTransformer(EMBED_MODEL)
|
| 45 |
+
embeddings = []
|
| 46 |
+
meta = []
|
| 47 |
+
idx = 0
|
| 48 |
+
|
| 49 |
+
# read corpus and chunk into sentence groups
|
| 50 |
+
with CORPUS_PATH.open("r", encoding="utf-8") as f:
|
| 51 |
+
for line in tqdm(f, desc="Reading corpus"):
|
| 52 |
+
doc = json.loads(line)
|
| 53 |
+
url = doc.get("url")
|
| 54 |
+
title = doc.get("title","")
|
| 55 |
+
text = doc.get("text","")
|
| 56 |
+
sents = split_into_sentences(text)
|
| 57 |
+
if not sents:
|
| 58 |
+
continue
|
| 59 |
+
# group sentences into small chunks (1..MAX_SENTENCES_PER_CHUNK)
|
| 60 |
+
i = 0
|
| 61 |
+
while i < len(sents):
|
| 62 |
+
chunk_sents = sents[i:i+MAX_SENTENCES_PER_CHUNK]
|
| 63 |
+
chunk_text = " ".join(chunk_sents).strip()
|
| 64 |
+
if chunk_text:
|
| 65 |
+
meta.append({"id": idx, "url": url, "title": title, "text": chunk_text})
|
| 66 |
+
idx += 1
|
| 67 |
+
i += MAX_SENTENCES_PER_CHUNK
|
| 68 |
+
|
| 69 |
+
if not meta:
|
| 70 |
+
raise RuntimeError("No chunks created from corpus (empty corpus?)")
|
| 71 |
+
|
| 72 |
+
# encode in batches for memory efficiency
|
| 73 |
+
texts = [m["text"] for m in meta]
|
| 74 |
+
batch_size = 64
|
| 75 |
+
all_embs = []
|
| 76 |
+
for i in tqdm(range(0, len(texts), batch_size), desc="Embedding"):
|
| 77 |
+
batch = texts[i:i+batch_size]
|
| 78 |
+
embs = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
|
| 79 |
+
all_embs.append(embs)
|
| 80 |
+
embeddings = np.vstack(all_embs).astype("float32")
|
| 81 |
+
|
| 82 |
+
# normalize to use inner-product as cosine
|
| 83 |
+
faiss.normalize_L2(embeddings)
|
| 84 |
+
d = embeddings.shape[1]
|
| 85 |
+
index = faiss.IndexFlatIP(d)
|
| 86 |
+
index.add(embeddings)
|
| 87 |
+
faiss.write_index(index, str(INDEX_PATH))
|
| 88 |
+
with META_PATH.open("w", encoding="utf-8") as f:
|
| 89 |
+
for m in meta:
|
| 90 |
+
f.write(json.dumps(m, ensure_ascii=False) + "\n")
|
| 91 |
+
print(f"Built index with {index.ntotal} vectors. Saved to {INDEX_PATH}, meta to {META_PATH}")
|
| 92 |
+
|
| 93 |
+
if __name__ == "__main__":
|
| 94 |
+
build_index()
|
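A minimal sanity-check sketch for the index that build_index() writes (assumptions: faiss_index.bin and docs_meta.jsonl already exist at the project root, and the same embedding model is used as at build time; the query string is illustrative):

import json
import faiss
from sentence_transformers import SentenceTransformer

index = faiss.read_index("faiss_index.bin")
meta = [json.loads(line) for line in open("docs_meta.jsonl", encoding="utf-8")]
model = SentenceTransformer("all-MiniLM-L6-v2")  # must match EMBED_MODEL used by the indexer

query = model.encode(["How do I configure SAML SSO with Okta?"], convert_to_numpy=True).astype("float32")
faiss.normalize_L2(query)  # vectors were L2-normalized at build time, so inner product behaves as cosine
scores, ids = index.search(query, 3)
for score, idx in zip(scores[0], ids[0]):
    print(round(float(score), 3), meta[idx]["url"])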
src/rag.py
ADDED
|
@@ -0,0 +1,419 @@
|
| 1 |
+
# src/rag.py
|
| 2 |
+
"""
|
| 3 |
+
RAG module (updated)
|
| 4 |
+
- FAISS retrieval (sentence-transformers embeddings)
|
| 5 |
+
- Cross-encoder reranker (optional)
|
| 6 |
+
- Prompt template with sentence-aware snippet trimming
|
| 7 |
+
- Generation (OpenAI preferred) or local Flan-T5 fallback
|
| 8 |
+
- Post-processing: concise N sentences, no trailing "..." and no placeholders
|
| 9 |
+
- Deduplication / diversity of contexts by URL
|
| 10 |
+
- Procedural snippet handling (take next sentence if top is a header/list)
|
| 11 |
+
"""
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import ujson as json
|
| 14 |
+
import numpy as np
|
| 15 |
+
import textwrap
|
| 16 |
+
import os
|
| 17 |
+
import faiss
|
| 18 |
+
import re
|
| 19 |
+
|
| 20 |
+
# embeddings & models
|
| 21 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
import openai
|
| 25 |
+
except Exception:
|
| 26 |
+
openai = None
|
| 27 |
+
|
| 28 |
+
# local generator (transformers)
|
| 29 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
| 30 |
+
|
| 31 |
+
ROOT = Path(__file__).parent.parent.resolve()
|
| 32 |
+
INDEX_PATH = ROOT.joinpath("faiss_index.bin")
|
| 33 |
+
META_PATH = ROOT.joinpath("docs_meta.jsonl")
|
| 34 |
+
|
| 35 |
+
EMBED_MODEL = "all-MiniLM-L6-v2" # embeddings model (fast)
|
| 36 |
+
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" # reranker
|
| 37 |
+
|
| 38 |
+
# local text generator model (CPU)
|
| 39 |
+
LOCAL_GEN_MODEL = "google/flan-t5-small"
|
| 40 |
+
|
| 41 |
+
# tune here: how many sentences to keep in final answer
|
| 42 |
+
MAX_ANSWER_SENTENCES = 2
|
| 43 |
+
|
| 44 |
+
# diversify: max chunks per url allowed in final top_candidates
|
| 45 |
+
MAX_CHUNKS_PER_URL = 2
|
| 46 |
+
|
| 47 |
+
# lazy-loaded resources
|
| 48 |
+
_index = None
|
| 49 |
+
_meta = None
|
| 50 |
+
_embed_model = None
|
| 51 |
+
_cross_encoder = None
|
| 52 |
+
_local_generator = None
|
| 53 |
+
|
| 54 |
+
# -------------------------
|
| 55 |
+
# Utilities: sentence helpers & tidy answer
|
| 56 |
+
# -------------------------
|
| 57 |
+
RE_SENT_SPLIT = re.compile(r'([.!?])\s+') # split and keep punctuation
|
| 58 |
+
RE_ANGLE_PLACEHOLDER = re.compile(r"<[^>]{1,200}>")
|
| 59 |
+
RE_DOUBLE_DASH_ID = re.compile(r"\b[a-zA-Z0-9_-]{3,}--\d{3,}\b")
|
| 60 |
+
RE_MULTI_DOTS = re.compile(r"\.{2,}")
|
| 61 |
+
RE_WHITESPACE = re.compile(r"\s+")
|
| 62 |
+
|
| 63 |
+
def split_into_sentences(text: str):
|
| 64 |
+
if not text:
|
| 65 |
+
return []
|
| 66 |
+
parts = RE_SENT_SPLIT.split(text)
|
| 67 |
+
sentences = []
|
| 68 |
+
for i in range(0, len(parts), 2):
|
| 69 |
+
chunk = parts[i].strip()
|
| 70 |
+
punct = parts[i+1] if (i+1)<len(parts) else ""
|
| 71 |
+
sentence = (chunk + punct).strip()
|
| 72 |
+
if sentence:
|
| 73 |
+
sentences.append(sentence)
|
| 74 |
+
return sentences
|
| 75 |
+
|
| 76 |
+
# -------------------------
|
| 77 |
+
# NEW: sentence-level extraction helpers with procedural handling
|
| 78 |
+
# -------------------------
|
| 79 |
+
def get_top_sentences_from_passage(passage_text: str, query: str, embed_model, top_n: int = 1):
|
| 80 |
+
"""
|
| 81 |
+
Given a passage text, split into sentences and return top_n sentences by cosine similarity.
|
| 82 |
+
For 'procedural' outputs where the top sentence is a header/menu (short or 'how to'), also
|
| 83 |
+
include the next sentence to provide actionable content.
|
| 84 |
+
"""
|
| 85 |
+
if not passage_text:
|
| 86 |
+
return []
|
| 87 |
+
sents = split_into_sentences(passage_text)
|
| 88 |
+
if not sents:
|
| 89 |
+
return []
|
| 90 |
+
if len(sents) <= top_n:
|
| 91 |
+
return sents[:top_n]
|
| 92 |
+
|
| 93 |
+
# embed query + sentences
|
| 94 |
+
q_emb = embed_model.encode([query], convert_to_numpy=True)
|
| 95 |
+
s_embs = embed_model.encode(sents, convert_to_numpy=True)
|
| 96 |
+
|
| 97 |
+
def norm(x):
|
| 98 |
+
n = np.linalg.norm(x)
|
| 99 |
+
return x / (n + 1e-10)
|
| 100 |
+
qn = norm(q_emb[0])
|
| 101 |
+
sims = [float(np.dot(qn, norm(se))) for se in s_embs]
|
| 102 |
+
idxs = sorted(range(len(sents)), key=lambda i: sims[i], reverse=True)[:top_n]
|
| 103 |
+
|
| 104 |
+
# if top sentence looks like a header/menu (very short, contains 'how to' or ends with ':'), also include the next sentence
|
| 105 |
+
out = []
|
| 106 |
+
for idx in idxs:
|
| 107 |
+
out.append(sents[idx])
|
| 108 |
+
# heuristic: if that sentence is short or contains "how to" or looks like heading, add next sentence if exists
|
| 109 |
+
s = sents[idx].lower()
|
| 110 |
+
word_count = len(s.split())
|
| 111 |
+
if (word_count <= 6 or 'how to' in s or s.endswith(':')) and (idx + 1) < len(sents):
|
| 112 |
+
out.append(sents[idx+1])
|
| 113 |
+
# dedupe while preserving order
|
| 114 |
+
seen = set()
|
| 115 |
+
final = []
|
| 116 |
+
for x in out:
|
| 117 |
+
if x not in seen:
|
| 118 |
+
final.append(x)
|
| 119 |
+
seen.add(x)
|
| 120 |
+
return final[:top_n]
|
| 121 |
+
|
| 122 |
+
def extract_fallback_from_contexts(contexts: list, query: str, n_sentences:int = 1) -> str:
|
| 123 |
+
"""
|
| 124 |
+
Deterministic fallback: find the single best sentence across contexts and return it verbatim.
|
| 125 |
+
"""
|
| 126 |
+
_, _, embed_model = load_index_and_meta()
|
| 127 |
+
best = None
|
| 128 |
+
best_score = -1.0
|
| 129 |
+
import numpy as np
|
| 130 |
+
q_emb = embed_model.encode([query], convert_to_numpy=True)[0]
|
| 131 |
+
def norm(x):
|
| 132 |
+
n=np.linalg.norm(x); return x/(n+1e-10)
|
| 133 |
+
qn = norm(q_emb)
|
| 134 |
+
|
| 135 |
+
for c in contexts:
|
| 136 |
+
sents = split_into_sentences(c.get("text",""))
|
| 137 |
+
if not sents:
|
| 138 |
+
continue
|
| 139 |
+
s_embs = embed_model.encode(sents, convert_to_numpy=True)
|
| 140 |
+
for s, se in zip(sents, s_embs):
|
| 141 |
+
sc = float(np.dot(qn, norm(se)))
|
| 142 |
+
if sc > best_score:
|
| 143 |
+
best_score = sc
|
| 144 |
+
best = s
|
| 145 |
+
if not best:
|
| 146 |
+
return ""
|
| 147 |
+
return best
|
| 148 |
+
|
| 149 |
+
def tidy_answer(ans: str, max_sentences: int = MAX_ANSWER_SENTENCES) -> str:
|
| 150 |
+
if not ans:
|
| 151 |
+
return ans
|
| 152 |
+
a = ans
|
| 153 |
+
a = RE_ANGLE_PLACEHOLDER.sub(" ", a)
|
| 154 |
+
a = RE_DOUBLE_DASH_ID.sub(" ", a)
|
| 155 |
+
a = a.replace("…", ". ")
|
| 156 |
+
a = RE_MULTI_DOTS.sub(". ", a)
|
| 157 |
+
a = RE_WHITESPACE.sub(" ", a).strip()
|
| 158 |
+
sents = split_into_sentences(a)
|
| 159 |
+
if not sents:
|
| 160 |
+
snippet = a[:300].strip()
|
| 161 |
+
if not snippet.endswith("."):
|
| 162 |
+
snippet = snippet.rstrip(" .,") + "."
|
| 163 |
+
return snippet
|
| 164 |
+
take = sents[:max_sentences]
|
| 165 |
+
out = " ".join(take).strip()
|
| 166 |
+
if out and out[-1] not in ".!?":
|
| 167 |
+
out = out.rstrip(" .,") + "."
|
| 168 |
+
return out
|
| 169 |
+
|
| 170 |
+
# -------------------------
|
| 171 |
+
# Loading helpers
|
| 172 |
+
# -------------------------
|
| 173 |
+
def load_index_and_meta():
|
| 174 |
+
global _index, _meta, _embed_model
|
| 175 |
+
if _index is None:
|
| 176 |
+
if not INDEX_PATH.exists():
|
| 177 |
+
raise FileNotFoundError(f"FAISS index not found at {INDEX_PATH}. Run src/indexer.py first.")
|
| 178 |
+
_index = faiss.read_index(str(INDEX_PATH))
|
| 179 |
+
if _meta is None:
|
| 180 |
+
if not META_PATH.exists():
|
| 181 |
+
raise FileNotFoundError(f"Meta file not found at {META_PATH}. Run src/indexer.py first.")
|
| 182 |
+
_meta = [json.loads(l) for l in META_PATH.read_text(encoding="utf-8").splitlines()]
|
| 183 |
+
if _embed_model is None:
|
| 184 |
+
_embed_model = SentenceTransformer(EMBED_MODEL)
|
| 185 |
+
return _index, _meta, _embed_model
|
| 186 |
+
|
| 187 |
+
def get_cross_encoder():
|
| 188 |
+
global _cross_encoder
|
| 189 |
+
if _cross_encoder is None:
|
| 190 |
+
_cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL)
|
| 191 |
+
return _cross_encoder
|
| 192 |
+
|
| 193 |
+
def get_local_generator():
|
| 194 |
+
global _local_generator
|
| 195 |
+
if _local_generator is None:
|
| 196 |
+
tok = AutoTokenizer.from_pretrained(LOCAL_GEN_MODEL)
|
| 197 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(LOCAL_GEN_MODEL)
|
| 198 |
+
_local_generator = pipeline("text2text-generation", model=model, tokenizer=tok, device=-1)
|
| 199 |
+
return _local_generator
|
| 200 |
+
|
| 201 |
+
# -------------------------
|
| 202 |
+
# Embedding + retrieval
|
| 203 |
+
# -------------------------
|
| 204 |
+
def embed_query(q: str):
|
| 205 |
+
_, _, em = load_index_and_meta()
|
| 206 |
+
emb = em.encode(q)
|
| 207 |
+
emb = np.asarray(emb, dtype="float32")
|
| 208 |
+
if emb.ndim == 1:
|
| 209 |
+
emb = emb.reshape(1, -1)
|
| 210 |
+
faiss.normalize_L2(emb)
|
| 211 |
+
return emb
|
| 212 |
+
|
| 213 |
+
def retrieve_candidates(query: str, top_k: int = 50):
|
| 214 |
+
index, meta, _ = load_index_and_meta()
|
| 215 |
+
emb = embed_query(query)
|
| 216 |
+
D, I = index.search(emb, top_k)
|
| 217 |
+
results = []
|
| 218 |
+
if len(I) == 0:
|
| 219 |
+
return results
|
| 220 |
+
for score, idx in zip(D[0], I[0]):
|
| 221 |
+
if idx < 0:
|
| 222 |
+
continue
|
| 223 |
+
m = meta[idx]
|
| 224 |
+
results.append({"score": float(score), "url": m["url"], "title": m.get("title",""), "text": m["text"], "id": idx})
|
| 225 |
+
return results
|
| 226 |
+
|
| 227 |
+
# -------------------------
|
| 228 |
+
# Reranking
|
| 229 |
+
# -------------------------
|
| 230 |
+
def rerank_with_cross(query: str, candidates: list, top_n: int = 5):
|
| 231 |
+
if not candidates:
|
| 232 |
+
return []
|
| 233 |
+
cross = get_cross_encoder()
|
| 234 |
+
inputs = [(query, c["text"]) for c in candidates]
|
| 235 |
+
try:
|
| 236 |
+
scores = cross.predict(inputs)
|
| 237 |
+
except Exception as e:
|
| 238 |
+
# fallback: if cross encoder fails, return top_n by original score
|
| 239 |
+
candidates.sort(key=lambda x: x.get("score",0), reverse=True)
|
| 240 |
+
return candidates[:top_n]
|
| 241 |
+
for c, s in zip(candidates, scores):
|
| 242 |
+
c["rerank_score"] = float(s)
|
| 243 |
+
candidates.sort(key=lambda x: x["rerank_score"], reverse=True)
|
| 244 |
+
return candidates[:top_n]
|
| 245 |
+
|
| 246 |
+
# -------------------------
|
| 247 |
+
# Snippet trimming helpers
|
| 248 |
+
# -------------------------
|
| 249 |
+
def trim_snippet_to_sentence(snippet: str, max_chars: int = 800) -> str:
|
| 250 |
+
if not snippet:
|
| 251 |
+
return snippet
|
| 252 |
+
s = snippet.replace("\n", " ").strip()
|
| 253 |
+
if len(s) <= max_chars:
|
| 254 |
+
return s
|
| 255 |
+
head = s[:max_chars]
|
| 256 |
+
last_dot = max(head.rfind("."), head.rfind("!"), head.rfind("?"))
|
| 257 |
+
if last_dot and last_dot > int(max_chars * 0.4):
|
| 258 |
+
return head[:last_dot+1].strip()
|
| 259 |
+
cut = head.rsplit(" ", 1)[0]
|
| 260 |
+
return cut.strip()
|
| 261 |
+
|
| 262 |
+
# -------------------------
|
| 263 |
+
# Prompt template (stronger)
|
| 264 |
+
# -------------------------
|
| 265 |
+
def build_prompt(query: str, contexts: list, sentences_per_context: int = 1):
|
| 266 |
+
"""
|
| 267 |
+
Build a prompt that is explicit about producing an ACTIONABLE, concise answer.
|
| 268 |
+
We include one short sentence per source (selected by semantic similarity).
|
| 269 |
+
"""
|
| 270 |
+
_, _, embed_model = load_index_and_meta()
|
| 271 |
+
sources_block = []
|
| 272 |
+
for i, c in enumerate(contexts, start=1):
|
| 273 |
+
passage = c.get("text", "").strip()
|
| 274 |
+
best_sents = get_top_sentences_from_passage(passage, query, embed_model, top_n=sentences_per_context)
|
| 275 |
+
snippet = " ".join(best_sents)
|
| 276 |
+
snippet = trim_snippet_to_sentence(snippet, max_chars=500)
|
| 277 |
+
snippet = snippet.rstrip(" .") + "." if snippet and snippet[-1] not in ".!?" else snippet
|
| 278 |
+
sources_block.append(f"[SRC_{i}] URL: {c.get('url')}\n[SRC_{i}] TEXT: {snippet}")
|
| 279 |
+
|
| 280 |
+
sources_text = "\n\n".join(sources_block)
|
| 281 |
+
|
| 282 |
+
prompt = textwrap.dedent(f"""
|
| 283 |
+
Use only the following snippets to produce a concise, ACTIONABLE answer (1-2 short sentences) that directly answers the question.
|
| 284 |
+
For "how-to" queries, produce concrete steps or exact fields to set where possible. Do NOT invent facts or add information not present in snippets.
|
| 285 |
+
If snippets do not contain an answer, reply: "I don't know — please consult the documentation." Then list the Source URLs used.
|
| 286 |
+
|
| 287 |
+
{sources_text}
|
| 288 |
+
|
| 289 |
+
Question:
|
| 290 |
+
{query}
|
| 291 |
+
|
| 292 |
+
Answer (be concise and then list Sources used as URLs):
|
| 293 |
+
""").strip()
|
| 294 |
+
return prompt
|
| 295 |
+
|
| 296 |
+
# -------------------------
|
| 297 |
+
# Generation (OpenAI or local) with fallback formatting
|
| 298 |
+
# -------------------------
|
| 299 |
+
def generate_answer_with_context(question: str, contexts: list, use_openai: bool = False):
|
| 300 |
+
prompt = build_prompt(question, contexts, sentences_per_context=1)
|
| 301 |
+
|
| 302 |
+
# Option A: OpenAI (preferred)
|
| 303 |
+
if use_openai and openai is not None and os.environ.get("OPENAI_API_KEY"):
|
| 304 |
+
try:
|
| 305 |
+
resp = openai.ChatCompletion.create(
|
| 306 |
+
model="gpt-3.5-turbo",
|
| 307 |
+
messages=[
|
| 308 |
+
{"role":"system","content":"You are a strict assistant. Use ONLY the provided documentation snippets to answer. Do not hallucinate."},
|
| 309 |
+
{"role":"user","content": prompt}
|
| 310 |
+
],
|
| 311 |
+
temperature=0.0,
|
| 312 |
+
max_tokens=200,
|
| 313 |
+
stop=None
|
| 314 |
+
)
|
| 315 |
+
raw_answer = resp["choices"][0]["message"]["content"].strip()
|
| 316 |
+
answer = tidy_answer(raw_answer, max_sentences=MAX_ANSWER_SENTENCES)
|
| 317 |
+
# fallback if model returned unhelpful phrasing
|
| 318 |
+
if answer.lower().startswith("i'm") or "find the source" in answer.lower() or answer.lower().startswith("see"):
|
| 319 |
+
fallback = extract_fallback_from_contexts(contexts, question)
|
| 320 |
+
fallback = fallback.strip()
|
| 321 |
+
if fallback and not fallback.endswith(('.', '!', '?')):
|
| 322 |
+
fallback = fallback + '.'
|
| 323 |
+
if 'okta' in fallback.lower() or 'authenticator' in fallback.lower():
|
| 324 |
+
fallback = "Enable Okta SAML SSO: " + fallback
|
| 325 |
+
return tidy_answer(fallback, max_sentences=MAX_ANSWER_SENTENCES), [c["url"] for c in contexts]
|
| 326 |
+
used_urls = [c["url"] for c in contexts if c["url"] in raw_answer]
|
| 327 |
+
if not used_urls:
|
| 328 |
+
used_urls = [c["url"] for c in contexts]
|
| 329 |
+
return answer, used_urls
|
| 330 |
+
except Exception as e:
|
| 331 |
+
print("OpenAI generation failed:", e)
|
| 332 |
+
|
| 333 |
+
# Option B: Local generator fallback
|
| 334 |
+
gen = get_local_generator()
|
| 335 |
+
gen_kwargs = {
|
| 336 |
+
"max_length": 200,
|
| 337 |
+
"num_beams": 4,
|
| 338 |
+
"do_sample": False,
|
| 339 |
+
"no_repeat_ngram_size": 3,
|
| 340 |
+
"early_stopping": True
|
| 341 |
+
}
|
| 342 |
+
out = gen(prompt, **gen_kwargs)
|
| 343 |
+
raw_answer = out[0].get("generated_text","").strip()
|
| 344 |
+
answer = tidy_answer(raw_answer, max_sentences=MAX_ANSWER_SENTENCES)
|
| 345 |
+
if answer.lower().startswith("i'm") or "find the source" in answer.lower() or answer.lower().startswith("see"):
|
| 346 |
+
fallback = extract_fallback_from_contexts(contexts, question)
|
| 347 |
+
fallback = fallback.strip()
|
| 348 |
+
if fallback and not fallback.endswith(('.', '!', '?')):
|
| 349 |
+
fallback = fallback + '.'
|
| 350 |
+
if 'okta' in fallback.lower() or 'authenticator' in fallback.lower():
|
| 351 |
+
fallback = "Enable Okta SAML SSO: " + fallback
|
| 352 |
+
return tidy_answer(fallback, max_sentences=MAX_ANSWER_SENTENCES), [c["url"] for c in contexts]
|
| 353 |
+
return answer, [c["url"] for c in contexts]
|
| 354 |
+
|
| 355 |
+
# -------------------------
|
| 356 |
+
# Top-level handler with dedup/diversify
|
| 357 |
+
# -------------------------
|
| 358 |
+
def handle_rag_query(query: str, top_k: int = 5, use_openai: bool = False, rerank_candidates: int = 50):
|
| 359 |
+
candidates = retrieve_candidates(query, top_k=rerank_candidates)
|
| 360 |
+
if not candidates:
|
| 361 |
+
return {"answer": "No relevant documentation found.", "sources": [], "retrieved": []}
|
| 362 |
+
|
| 363 |
+
try:
|
| 364 |
+
top_candidates = rerank_with_cross(query, candidates, top_n=rerank_candidates)
|
| 365 |
+
except Exception as e:
|
| 366 |
+
print("Reranker failed, falling back to FAISS order:", e)
|
| 367 |
+
top_candidates = candidates[:rerank_candidates]
|
| 368 |
+
|
| 369 |
+
# Now pick final top_k diversified by URL:
|
| 370 |
+
# strategy: prefer at most MAX_CHUNKS_PER_URL per url; prefer higher rerank_score
|
| 371 |
+
# first, group by URL preserving order
|
| 372 |
+
url_counts = {}
|
| 373 |
+
diversified = []
|
| 374 |
+
for c in top_candidates:
|
| 375 |
+
url = c.get("url")
|
| 376 |
+
cnt = url_counts.get(url, 0)
|
| 377 |
+
if cnt < MAX_CHUNKS_PER_URL:
|
| 378 |
+
diversified.append(c)
|
| 379 |
+
url_counts[url] = cnt + 1
|
| 380 |
+
# stop early if we have enough
|
| 381 |
+
if len(diversified) >= max(top_k, len(top_candidates)):
|
| 382 |
+
break
|
| 383 |
+
|
| 384 |
+
# final trimming: ensure at most one chunk per URL until we fill top_k
|
| 385 |
+
seen_urls = set()
|
| 386 |
+
unique_candidates = []
|
| 387 |
+
for c in diversified:
|
| 388 |
+
u = c.get("url")
|
| 389 |
+
if u in seen_urls:
|
| 390 |
+
continue
|
| 391 |
+
unique_candidates.append(c)
|
| 392 |
+
seen_urls.add(u)
|
| 393 |
+
if len(unique_candidates) >= top_k:
|
| 394 |
+
break
|
| 395 |
+
# if we don't have enough unique URLs, allow second chunks (already in diversified)
|
| 396 |
+
if len(unique_candidates) < top_k:
|
| 397 |
+
# fill from diversified preserving order but skipping already selected items
|
| 398 |
+
for c in diversified:
|
| 399 |
+
if c in unique_candidates:
|
| 400 |
+
continue
|
| 401 |
+
unique_candidates.append(c)
|
| 402 |
+
if len(unique_candidates) >= top_k:
|
| 403 |
+
break
|
| 404 |
+
final_candidates = unique_candidates[:top_k]
|
| 405 |
+
|
| 406 |
+
# generate answer using final candidates
|
| 407 |
+
answer, urls = generate_answer_with_context(query, final_candidates, use_openai=use_openai)
|
| 408 |
+
|
| 409 |
+
return {"answer": answer, "sources": urls, "retrieved": final_candidates}
|
| 410 |
+
|
| 411 |
+
# small test if run as script
|
| 412 |
+
if __name__ == "__main__":
|
| 413 |
+
q = "How do I configure SAML SSO with Okta?"
|
| 414 |
+
print("Running test query:", q)
|
| 415 |
+
res = handle_rag_query(q, top_k=3, use_openai=False)
|
| 416 |
+
print("ANSWER:\n", res["answer"])
|
| 417 |
+
print("SOURCES:\n", res["sources"])
|
| 418 |
+
for r in res["retrieved"][:3]:
|
| 419 |
+
print("----\n", r["url"], "\n", r["text"][:300])
|
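A minimal sketch of calling handle_rag_query with the OpenAI path enabled (assumptions: the API key below is a placeholder, the index files exist, and — since the code above uses the legacy openai.ChatCompletion interface — a pre-1.0 openai package is installed):

import os
import sys

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder; use a real key or export it in the shell
sys.path.append("src")  # assumption: executed from the project root
from rag import handle_rag_query

res = handle_rag_query("How do I create a business glossary in Atlan?", top_k=3, use_openai=True)
print(res["answer"])
for url in res["sources"]:
    print(" -", url)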
src/scrape_docs.py
ADDED
|
@@ -0,0 +1,166 @@
|
| 1 |
+
# src/scrape_docs.py
|
| 2 |
+
"""
|
| 3 |
+
Crawl allowed Atlan docs and write a cleaned docs_corpus.jsonl.
|
| 4 |
+
Improvements:
|
| 5 |
+
- robust cleaning of encoding artifacts (utf-8 replace + ftfy optional)
|
| 6 |
+
- removes paragraph markers ¶, <placeholders>, group-id--digits tokens
|
| 7 |
+
- strips boilerplate lines and tiny nav lines
|
| 8 |
+
- collapses and normalizes whitespace / encoding
|
| 9 |
+
- removes script/style/header/footer/nav/form tags before extracting
|
| 10 |
+
Output: docs_corpus.jsonl (overwrites)
|
| 11 |
+
"""
|
| 12 |
+
import requests
|
| 13 |
+
import html
|
| 14 |
+
import re
|
| 15 |
+
from bs4 import BeautifulSoup
|
| 16 |
+
from urllib.parse import urljoin, urlparse
|
| 17 |
+
from collections import deque
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from url_normalize import url_normalize
|
| 20 |
+
import ujson as json
|
| 21 |
+
from tqdm import tqdm
|
| 22 |
+
|
| 23 |
+
OUTPUT = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
|
| 24 |
+
SEEDS = [
|
| 25 |
+
"https://docs.atlan.com/",
|
| 26 |
+
"https://developer.atlan.com/"
|
| 27 |
+
]
|
| 28 |
+
ALLOWED_DOMAINS = {"docs.atlan.com", "developer.atlan.com"}
|
| 29 |
+
HEADERS = {"User-Agent": "atlan-rag-bot/0.1 (+your_email@example.com)"}
|
| 30 |
+
|
| 31 |
+
# heuristics
|
| 32 |
+
MIN_LINE_WORDS = 3
|
| 33 |
+
MIN_PAGE_WORDS = 30
|
| 34 |
+
|
| 35 |
+
# regex cleanup
|
| 36 |
+
RE_CONTROL = re.compile(r"[\x00-\x1f\x7f-\x9f]")
|
| 37 |
+
RE_PARAGRAPH_MARK = re.compile(r"¶")
|
| 38 |
+
RE_ANGLE_PLACEHOLDER = re.compile(r"<[^>\n]{1,200}>")
|
| 39 |
+
RE_DOUBLE_DASH_ID = re.compile(r"\b[a-zA-Z0-9_-]{3,}--\d{3,}\b")
|
| 40 |
+
RE_MULTIPLE_SPACES = re.compile(r"\s+")
|
| 41 |
+
RE_REPEATED_CHAR = re.compile(r"(.)\1{5,}") # long repeated chars
|
| 42 |
+
RE_BAD_ELLIPSIS = re.compile(r"\.{2,}") # multiple dots
|
| 43 |
+
|
| 44 |
+
BOILERPLATE_KEYWORDS = [
|
| 45 |
+
"table of contents", "overview", "read more", "privacy", "terms", "©", "cookie",
|
| 46 |
+
"search", "related articles", "last updated", "release notes", "subscribe", "breadcrumb"
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
# optional: try to import ftfy for robust fixes (if installed)
|
| 50 |
+
try:
|
| 51 |
+
import ftfy
|
| 52 |
+
except Exception:
|
| 53 |
+
ftfy = None
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def is_allowed(url):
|
| 57 |
+
try:
|
| 58 |
+
return urlparse(url).netloc in ALLOWED_DOMAINS
|
| 59 |
+
except:
|
| 60 |
+
return False
|
| 61 |
+
|
| 62 |
+
def _keep_line(line: str) -> bool:
|
| 63 |
+
s = line.strip().lower()
|
| 64 |
+
if not s:
|
| 65 |
+
return False
|
| 66 |
+
if len(s.split()) < MIN_LINE_WORDS:
|
| 67 |
+
return False
|
| 68 |
+
if s.startswith("http") or s.startswith("www."):
|
| 69 |
+
return False
|
| 70 |
+
for k in BOILERPLATE_KEYWORDS:
|
| 71 |
+
if k in s:
|
| 72 |
+
return False
|
| 73 |
+
# short code-like lines
|
| 74 |
+
if len(s) < 10 and any(ch in s for ch in ['/', '.', '#']):
|
| 75 |
+
return False
|
| 76 |
+
return True
|
| 77 |
+
|
| 78 |
+
def clean_text(soup):
|
| 79 |
+
# remove undesired blocks
|
| 80 |
+
for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
|
| 81 |
+
tag.decompose()
|
| 82 |
+
parts = []
|
| 83 |
+
# only consider headings, paragraphs and list items
|
| 84 |
+
for el in soup.find_all(["h1", "h2", "h3", "p", "li"]):
|
| 85 |
+
t = el.get_text(separator=" ", strip=True)
|
| 86 |
+
if not t:
|
| 87 |
+
continue
|
| 88 |
+
# HTML unescape
|
| 89 |
+
t = html.unescape(t)
|
| 90 |
+
# remove paragraph mark and placeholders
|
| 91 |
+
t = RE_PARAGRAPH_MARK.sub(" ", t)
|
| 92 |
+
t = RE_ANGLE_PLACEHOLDER.sub(" ", t)
|
| 93 |
+
t = RE_DOUBLE_DASH_ID.sub(" ", t)
|
| 94 |
+
# remove control chars
|
| 95 |
+
t = RE_CONTROL.sub(" ", t)
|
| 96 |
+
# remove excessive repeated chars
|
| 97 |
+
t = RE_REPEATED_CHAR.sub(" ", t)
|
| 98 |
+
# normalize ellipsis
|
| 99 |
+
t = RE_BAD_ELLIPSIS.sub(". ", t)
|
| 100 |
+
# collapse whitespace
|
| 101 |
+
t = RE_MULTIPLE_SPACES.sub(" ", t).strip()
|
| 102 |
+
if _keep_line(t):
|
| 103 |
+
parts.append(t)
|
| 104 |
+
joined = "\n\n".join(parts).strip()
|
| 105 |
+
# final normalization: force utf-8 safe output & fix broken chars
|
| 106 |
+
joined = joined.encode('utf-8', errors='replace').decode('utf-8')
|
| 107 |
+
joined = joined.replace("\ufffd", " ")
|
| 108 |
+
# optional stronger fix using ftfy if available
|
| 109 |
+
if ftfy is not None:
|
| 110 |
+
joined = ftfy.fix_text(joined)
|
| 111 |
+
# Remove common weird bytes sequences left by encoding (Â, â etc.)
|
| 112 |
+
joined = joined.replace("Â", "").replace("â", "")
|
| 113 |
+
joined = RE_MULTIPLE_SPACES.sub(" ", joined).strip()
|
| 114 |
+
return joined
|
| 115 |
+
|
| 116 |
+
def crawl(seeds=SEEDS, max_pages=1000, max_depth=2):
|
| 117 |
+
seen = set()
|
| 118 |
+
out = []
|
| 119 |
+
q = deque()
|
| 120 |
+
for s in seeds:
|
| 121 |
+
q.append((s, 0))
|
| 122 |
+
pbar = tqdm(total=max_pages, desc="Crawl", unit="page")
|
| 123 |
+
while q and len(out) < max_pages:
|
| 124 |
+
url, depth = q.popleft()
|
| 125 |
+
url = url_normalize(url)
|
| 126 |
+
if url in seen:
|
| 127 |
+
continue
|
| 128 |
+
if depth > max_depth:
|
| 129 |
+
continue
|
| 130 |
+
if not is_allowed(url):
|
| 131 |
+
seen.add(url)
|
| 132 |
+
continue
|
| 133 |
+
try:
|
| 134 |
+
r = requests.get(url, headers=HEADERS, timeout=12)
|
| 135 |
+
if r.status_code != 200:
|
| 136 |
+
seen.add(url)
|
| 137 |
+
continue
|
| 138 |
+
soup = BeautifulSoup(r.text, "html.parser")
|
| 139 |
+
title = soup.title.string.strip() if soup.title else url
|
| 140 |
+
text = clean_text(soup)
|
| 141 |
+
if text and len(text.split()) >= MIN_PAGE_WORDS:
|
| 142 |
+
out.append({"url": url, "title": title, "text": text})
|
| 143 |
+
pbar.update(1)
|
| 144 |
+
seen.add(url)
|
| 145 |
+
# find links
|
| 146 |
+
for a in soup.find_all("a", href=True):
|
| 147 |
+
href = urljoin(url, a["href"])
|
| 148 |
+
href = url_normalize(href)
|
| 149 |
+
if is_allowed(href) and href not in seen:
|
| 150 |
+
# skip common media files
|
| 151 |
+
if any(href.lower().endswith(ext) for ext in [".pdf", ".zip", ".png", ".jpg", ".jpeg", ".svg"]):
|
| 152 |
+
continue
|
| 153 |
+
q.append((href, depth + 1))
|
| 154 |
+
except Exception as e:
|
| 155 |
+
# keep going
|
| 156 |
+
seen.add(url)
|
| 157 |
+
continue
|
| 158 |
+
pbar.close()
|
| 159 |
+
# write JSONL (overwrite)
|
| 160 |
+
with OUTPUT.open("w", encoding="utf-8") as f:
|
| 161 |
+
for doc in out:
|
| 162 |
+
f.write(json.dumps(doc, ensure_ascii=False) + "\n")
|
| 163 |
+
print(f"Wrote {len(out)} docs to {OUTPUT}")
|
| 164 |
+
|
| 165 |
+
if __name__ == "__main__":
|
| 166 |
+
crawl(max_pages=400, max_depth=2)
|
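A minimal sketch showing how the crawler's limits can be lowered for quick local experiments (assumption: executed from the project root; output still goes to docs_corpus.jsonl at the project root, as in the module above):

import sys
sys.path.append("src")  # assumption: executed from the project root
from scrape_docs import crawl

# crawl fewer pages and stay shallow; useful for a fast, smaller corpus during development
crawl(max_pages=50, max_depth=1)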
streamlit_app.py
ADDED
|
@@ -0,0 +1,11 @@
|
| 1 |
+
# streamlit_app.py
|
| 2 |
+
"""
|
| 3 |
+
Wrapper so Hugging Face Spaces (Streamlit SDK) can launch the app.
|
| 4 |
+
It simply runs src/app.py as if it were the main file.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import runpy
|
| 9 |
+
|
| 10 |
+
# Run src/app.py as the main script
|
| 11 |
+
runpy.run_path(str(Path(__file__).parent.joinpath("src", "app.py")), run_name="__main__")
|