ashkunwar committed
Commit 4ee7173 · 1 Parent(s): 3046482

Update application with enhanced features for Hugging Face deployment
.python-version DELETED
@@ -1 +0,0 @@
- 3.9

.streamlit/secrets.toml.template ADDED
@@ -0,0 +1,5 @@
+ # Copy this file to .streamlit/secrets.toml and add your actual API key
+ # DO NOT commit the actual secrets.toml file to git
+
+ [default]
+ GROQ_API_KEY = "your_groq_api_key_here"
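
A quick local sanity check, not part of the commit: the sketch below (assuming Python 3.11+ for the stdlib tomllib module) confirms the copied secrets file parses and actually carries a key before you launch Streamlit.

    # Hypothetical helper, not in the repo: validate .streamlit/secrets.toml locally.
    import tomllib

    with open(".streamlit/secrets.toml", "rb") as f:
        secrets = tomllib.load(f)

    # The template nests the key under [default]; fail fast if it is still the placeholder.
    key = secrets["default"]["GROQ_API_KEY"]
    assert key and key != "your_groq_api_key_here", "add your real Groq API key"
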
Atlan/Dockerfile ADDED
@@ -0,0 +1,56 @@
+ # Dockerfile for Hugging Face Spaces - Streamlit App
+ FROM python:3.11-slim
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create user for security
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # Set environment variables
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1
+
+ # Set working directory
+ WORKDIR $HOME/app
+
+ # Copy requirements first for better Docker layer caching
+ COPY --chown=user:user requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir --user -r requirements.txt
+
+ # Copy the application files
+ COPY --chown=user:user . .
+
+ # Create necessary directories
+ RUN mkdir -p $HOME/.streamlit
+
+ # Create Streamlit config
+ RUN echo "\
+ [general]\n\
+ email = \"\"\n\
+ " > $HOME/.streamlit/credentials.toml
+
+ RUN echo "\
+ [server]\n\
+ headless = true\n\
+ enableCORS = false\n\
+ enableXsrfProtection = false\n\
+ port = 7860\n\
+ " > $HOME/.streamlit/config.toml
+
+ # Expose the port that Hugging Face Spaces expects
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
+
+ # Command to run the Streamlit app
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
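
The HEALTHCHECK above hits Streamlit's built-in health endpoint. A minimal sketch for checking the same endpoint from the host (assuming the container's port 7860 is published locally; requests is already in requirements.txt):

    # Hypothetical host-side check, not part of the image.
    import requests

    resp = requests.get("http://localhost:7860/_stcore/health", timeout=5)
    # Streamlit returns 200 with body "ok" once the server is up.
    print(resp.status_code, resp.text)
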
Atlan/app.py ADDED
@@ -0,0 +1,504 @@
+ import streamlit as st
+ st.set_page_config(
+     page_title="🎯 Atlan Customer Support Copilot",
+     page_icon="🎯",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ import json
+ import asyncio
+ import logging
+ import os
+ from typing import List, Dict
+ from datetime import datetime
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ try:
+     # Try multiple sources for API key: Environment variables first (HF Spaces), then Streamlit secrets
+     if 'GROQ_API_KEY' in os.environ:
+         st.success("🔑 API key loaded from environment variables")
+     elif hasattr(st, 'secrets') and 'GROQ_API_KEY' in st.secrets:
+         os.environ['GROQ_API_KEY'] = st.secrets['GROQ_API_KEY']
+         st.success("🔑 API key loaded from Streamlit Cloud secrets")
+     elif hasattr(st, 'secrets') and hasattr(st.secrets, 'default') and 'GROQ_API_KEY' in st.secrets.default:
+         os.environ['GROQ_API_KEY'] = st.secrets.default['GROQ_API_KEY']
+         st.success("🔑 API key loaded from Streamlit secrets")
+     else:
+         st.error("⚠️ GROQ_API_KEY not found!")
+         st.info("**For Hugging Face Spaces deployment:**")
+         st.info("1. Go to your Space Settings")
+         st.info("2. Click 'Variables and secrets' tab")
+         st.info("3. Add GROQ_API_KEY with your actual API key")
+         st.code("""
+ # In Hugging Face Spaces Secrets:
+ GROQ_API_KEY = "gsk_your_actual_groq_api_key_here"
+ """)
+         st.info("**For Streamlit Cloud deployment:**")
+         st.info("Add your API key in the Streamlit Cloud app settings > Secrets tab")
+         st.info("**For local development:**")
+         st.info("Add GROQ_API_KEY to your .env file")
+         st.code("""
+ # In .env file:
+ GROQ_API_KEY=your_groq_api_key_here
+ """)
+         st.stop()
+ except Exception as e:
+     st.error(f"⚠️ Error accessing API key: {e}")
+     st.error("Please check your configuration")
+     st.stop()
+
+ # Import application modules after environment setup
+ try:
+     from models import Ticket, TicketClassification, TopicTagEnum, SentimentEnum, PriorityEnum
+     from classifier import TicketClassifier
+     from enhanced_rag import EnhancedRAGPipeline
+ except ImportError as e:
+     st.error(f"❌ Failed to import required modules: {e}")
+     st.error("Please ensure all required files are present in the directory")
+     st.stop()
+
+ st.markdown("""
+ <style>
+ .main-header {
+     text-align: center;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     color: white;
+     padding: 2rem;
+     border-radius: 10px;
+     margin-bottom: 2rem;
+ }
+ .ticket-card {
+     border: 1px solid #e1e5e9;
+     border-radius: 8px;
+     padding: 1rem;
+     margin: 1rem 0;
+     background: white;
+     box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+ }
+ .tag {
+     background: #667eea;
+     color: white;
+     padding: 0.2rem 0.5rem;
+     border-radius: 15px;
+     font-size: 0.8rem;
+     margin: 0.2rem;
+     display: inline-block;
+ }
+ .metric-card {
+     background: white;
+     padding: 1rem;
+     border-radius: 8px;
+     box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+     text-align: center;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ @st.cache_resource
+ def initialize_ai_models():
+     try:
+         classifier = TicketClassifier()
+         rag_pipeline = EnhancedRAGPipeline(groq_client=classifier.client)
+         return classifier, rag_pipeline
+     except Exception as e:
+         st.error(f"❌ Failed to initialize AI models: {e}")
+         return None, None
+
+ def load_sample_tickets():
+     try:
+         with open('sample_tickets.json', 'r') as f:
+             tickets_data = json.load(f)
+         return [Ticket(**ticket_data) for ticket_data in tickets_data]
+     except FileNotFoundError:
+         st.warning("📋 Sample tickets file not found. Using demo data for cloud deployment.")
+         # Create minimal demo data for cloud deployment
+         demo_tickets = [
+             {
+                 "id": "DEMO-001",
+                 "subject": "Demo ticket - Connection issue",
+                 "body": "This is a demo ticket showing connection problems with our data source."
+             },
+             {
+                 "id": "DEMO-002",
+                 "subject": "Demo ticket - API question",
+                 "body": "This is a demo ticket asking about API usage and documentation."
+             }
+         ]
+         return [Ticket(**ticket_data) for ticket_data in demo_tickets]
+     except Exception as e:
+         st.error(f"❌ Error loading tickets: {e}")
+         return []
+
+ async def classify_tickets_async(classifier, tickets):
+     try:
+         classifications = await classifier.classify_tickets_bulk(tickets)
+         return list(zip(tickets, classifications))
+     except Exception as e:
+         st.error(f"❌ Classification error: {e}")
+         return []
+
+ def run_async(coro):
+     try:
+         loop = asyncio.get_event_loop()
+     except RuntimeError:
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+     return loop.run_until_complete(coro)
+
+ def calculate_stats(classified_tickets):
+     if not classified_tickets:
+         return {
+             'total': 0,
+             'high_priority': 0,
+             'frustrated': 0,
+             'rag_eligible': 0,
+             'most_common_tag': 'N/A',
+             'tag_counts': {}
+         }
+
+     total = len(classified_tickets)
+     high_priority = sum(1 for _, classification in classified_tickets
+                         if classification.priority == PriorityEnum.P0)
+     frustrated = sum(1 for _, classification in classified_tickets
+                      if classification.sentiment in [SentimentEnum.FRUSTRATED, SentimentEnum.ANGRY])
+
+     # Count RAG-eligible topics
+     rag_topics = ['How-to', 'Product', 'Best practices', 'API/SDK', 'SSO']
+     rag_eligible = sum(1 for _, classification in classified_tickets
+                        if any(tag.value in rag_topics for tag in classification.topic_tags))
+
+     # Count tag frequencies
+     tag_counts = {}
+     for _, classification in classified_tickets:
+         for tag in classification.topic_tags:
+             tag_counts[tag.value] = tag_counts.get(tag.value, 0) + 1
+
+     most_common_tag = max(tag_counts.keys(), key=lambda x: tag_counts[x]) if tag_counts else 'N/A'
+
+     return {
+         'total': total,
+         'high_priority': high_priority,
+         'frustrated': frustrated,
+         'rag_eligible': rag_eligible,
+         'most_common_tag': most_common_tag,
+         'tag_counts': tag_counts
+     }
+
+ def display_ticket_card(ticket, classification):
+     with st.container():
+         st.markdown(f"**{ticket.id}**")
+         st.write(f"**Subject:** {ticket.subject}")
+         st.write(f"**Message:** {ticket.body[:300]}{'...' if len(ticket.body) > 300 else ''}")
+
+         st.write("**📋 Topics:**")
+         cols = st.columns(max(len(classification.topic_tags), 1))  # st.columns(0) raises, so guard empty tag lists
+         for i, tag in enumerate(classification.topic_tags):
+             with cols[i]:
+                 st.markdown(f'<span style="background: #667eea; color: white; padding: 0.2rem 0.5rem; border-radius: 10px; font-size: 0.8rem; margin: 0.1rem;">{tag.value}</span>', unsafe_allow_html=True)
+
+         sentiment_color = '#ff6b6b' if 'frustrated' in classification.sentiment.value.lower() else '#ff3838' if 'angry' in classification.sentiment.value.lower() else '#4ecdc4' if 'curious' in classification.sentiment.value.lower() else '#95a5a6'
+         st.markdown(f"**😊 Sentiment:** <span style='background: {sentiment_color}; color: white; padding: 0.3rem 0.8rem; border-radius: 15px; font-size: 0.9rem;'>{classification.sentiment.value}</span>", unsafe_allow_html=True)
+
+         priority_color = '#ff3838' if 'P0' in classification.priority.value else '#ffa726' if 'P1' in classification.priority.value else '#66bb6a'
+         st.markdown(f"**🔥 Priority:** <span style='background: {priority_color}; color: white; padding: 0.3rem 0.8rem; border-radius: 15px; font-size: 0.9rem;'>{classification.priority.value}</span>", unsafe_allow_html=True)
+
+         st.write(f"**🤖 AI Reasoning:** {classification.reasoning}")
+         st.divider()
+
+ def main():
+     classifier, rag_pipeline = initialize_ai_models()
+
+     if classifier is None or rag_pipeline is None:
+         st.stop()
+
+     st.markdown("""
+     <div class="main-header">
+         <h1>🎯 Atlan Customer Support Copilot</h1>
+         <p>AI-powered ticket classification and intelligent response generation</p>
+     </div>
+     """, unsafe_allow_html=True)
+
+     # Sidebar navigation
+     st.sidebar.title("🧭 Navigation")
+     page = st.sidebar.selectbox("Choose a page", [
+         "📊 Bulk Classification Dashboard",
+         "🤖 Interactive AI Agent",
+         "📝 Single Ticket Classification",
+         "📂 Upload & Classify"
+     ])
+
+     # Page routing
+     if page == "📊 Bulk Classification Dashboard":
+         bulk_dashboard_page(classifier)
+     elif page == "🤖 Interactive AI Agent":
+         interactive_agent_page(classifier, rag_pipeline)
+     elif page == "📝 Single Ticket Classification":
+         single_ticket_page(classifier)
+     elif page == "📂 Upload & Classify":
+         upload_classify_page(classifier)
+
+ def bulk_dashboard_page(classifier):
+     """Bulk classification dashboard page"""
+     st.header("📊 Bulk Classification Dashboard")
+     st.subheader("Auto-loaded sample tickets with AI classification")
+
+     # Initialize session state for bulk results
+     if 'bulk_results' not in st.session_state:
+         st.session_state.bulk_results = None
+
+     # Auto-load bulk results
+     if st.session_state.bulk_results is None:
+         with st.spinner("🔄 Loading and classifying sample tickets..."):
+             tickets = load_sample_tickets()
+             if tickets:
+                 try:
+                     classified_tickets = run_async(classify_tickets_async(classifier, tickets))
+                     st.session_state.bulk_results = classified_tickets
+                     st.success(f"✅ Successfully classified {len(classified_tickets)} tickets!")
+                 except Exception as e:
+                     st.error(f"❌ Error during classification: {e}")
+                     st.session_state.bulk_results = []
+             else:
+                 st.session_state.bulk_results = []
+
+     if st.session_state.bulk_results:
+         # Display statistics
+         stats = calculate_stats(st.session_state.bulk_results)
+
+         col1, col2, col3, col4, col5 = st.columns(5)
+         with col1:
+             st.metric("📋 Total Tickets", stats['total'])
+         with col2:
+             st.metric("🚨 High Priority", stats['high_priority'])
+         with col3:
+             st.metric("😤 Frustrated/Angry", stats['frustrated'])
+         with col4:
+             st.metric("🤖 RAG-Eligible", stats['rag_eligible'])
+         with col5:
+             st.metric("🏷️ Top Topic", stats['most_common_tag'])
+
+         # Visualizations
+         if stats['tag_counts']:
+             col1, col2 = st.columns(2)
+
+             with col1:
+                 # Priority distribution
+                 priority_data = {}
+                 for _, classification in st.session_state.bulk_results:
+                     priority = classification.priority.value
+                     priority_data[priority] = priority_data.get(priority, 0) + 1
+
+                 fig_priority = px.pie(
+                     values=list(priority_data.values()),
+                     names=list(priority_data.keys()),
+                     title="📊 Priority Distribution",
+                     color_discrete_map={
+                         'P0 (High)': '#ff3838',
+                         'P1 (Medium)': '#ffa726',
+                         'P2 (Low)': '#66bb6a'
+                     }
+                 )
+                 st.plotly_chart(fig_priority, use_container_width=True)
+
+             with col2:
+                 # Topic distribution
+                 fig_tags = px.bar(
+                     x=list(stats['tag_counts'].values()),
+                     y=list(stats['tag_counts'].keys()),
+                     orientation='h',
+                     title="🏷️ Topic Distribution",
+                     labels={'x': 'Count', 'y': 'Topics'}
+                 )
+                 fig_tags.update_layout(height=400)
+                 st.plotly_chart(fig_tags, use_container_width=True)
+
+         # Display tickets with filters
+         st.subheader("📋 All Classified Tickets")
+
+         col1, col2, col3 = st.columns(3)
+         with col1:
+             priority_filter = st.selectbox("Filter by Priority",
+                                            ["All"] + [p.value for p in PriorityEnum])
+         with col2:
+             sentiment_filter = st.selectbox("Filter by Sentiment",
+                                             ["All"] + [s.value for s in SentimentEnum])
+         with col3:
+             topic_filter = st.selectbox("Filter by Topic",
+                                         ["All"] + [t.value for t in TopicTagEnum])
+
+         # Apply filters
+         filtered_results = st.session_state.bulk_results
+         if priority_filter != "All":
+             filtered_results = [(t, c) for t, c in filtered_results if c.priority.value == priority_filter]
+         if sentiment_filter != "All":
+             filtered_results = [(t, c) for t, c in filtered_results if c.sentiment.value == sentiment_filter]
+         if topic_filter != "All":
+             filtered_results = [(t, c) for t, c in filtered_results if any(tag.value == topic_filter for tag in c.topic_tags)]
+
+         st.info(f"Showing {len(filtered_results)} of {len(st.session_state.bulk_results)} tickets")
+
+         # Display filtered tickets
+         for ticket, classification in filtered_results:
+             display_ticket_card(ticket, classification)
+
+     # Refresh button
+     if st.button("🔄 Refresh Classifications"):
+         st.session_state.bulk_results = None
+         st.rerun()
+
+ def interactive_agent_page(classifier, rag_pipeline):
+     """Interactive AI agent page"""
+     st.header("🤖 Interactive AI Agent")
+     st.subheader("Submit a new ticket or question from any channel")
+
+     # Input form
+     with st.form("interactive_form"):
+         question = st.text_area(
+             "Customer Question or Ticket:",
+             placeholder="Enter the customer's question or ticket description...",
+             height=150
+         )
+
+         channel = st.selectbox(
+             "Channel:",
+             ["Web", "Email", "WhatsApp", "Voice", "Live Chat"]
+         )
+
+         submit_button = st.form_submit_button("🚀 Process with AI Agent")
+
+     if submit_button and question:
+         with st.spinner("🤖 Analyzing question and generating response..."):
+             try:
+                 # Create a dummy ticket for classification
+                 ticket = Ticket(id="INTERACTIVE-001", subject=question[:80], body=question)
+
+                 # Classify the ticket
+                 classification = run_async(classifier.classify_ticket(ticket))
+                 topic_tags = [tag.value for tag in classification.topic_tags]
+
+                 # Generate response using RAG pipeline
+                 rag_result = run_async(rag_pipeline.generate_answer(question, topic_tags))
+
+                 # Display results in two columns
+                 col1, col2 = st.columns(2)
+
+                 with col1:
+                     st.subheader("📊 Internal Analysis (Back-end View)")
+
+                     st.markdown(f"""
+                     **🏷️ Topic Tags:** {', '.join([f'`{tag}`' for tag in topic_tags])}
+
+                     **😊 Sentiment:** `{classification.sentiment.value}`
+
+                     **⚡ Priority:** `{classification.priority.value}`
+
+                     **🤖 AI Reasoning:** {classification.reasoning}
+                     """)
+
+                 with col2:
+                     st.subheader("💬 Final Response (Front-end View)")
+
+                     if rag_result['type'] == 'direct_answer':
+                         st.success("💡 Direct Answer (RAG-Generated)")
+                         st.write(rag_result['answer'])
+
+                         if rag_result.get('sources'):
+                             st.subheader("📚 Sources:")
+                             for source in rag_result['sources']:
+                                 st.markdown(f"- [{source}]({source})")
+                     else:
+                         st.warning("📋 Ticket Routed")
+                         st.write(rag_result['message'])
+
+             except Exception as e:
+                 st.error(f"❌ Error processing question: {e}")
+
+ def single_ticket_page(classifier):
+     """Single ticket classification page"""
+     st.header("📝 Single Ticket Classification")
+
+     with st.form("single_ticket_form"):
+         ticket_id = st.text_input("Ticket ID:", placeholder="e.g., TICKET-001")
+         subject = st.text_input("Subject:", placeholder="Enter ticket subject")
+         body = st.text_area("Message Body:", placeholder="Enter the full ticket message...", height=150)
+
+         classify_button = st.form_submit_button("🔍 Classify Ticket")
+
+     if classify_button and ticket_id and subject and body:
+         with st.spinner("🔄 Classifying ticket..."):
+             try:
+                 ticket = Ticket(id=ticket_id, subject=subject, body=body)
+                 classification = run_async(classifier.classify_ticket(ticket))
+
+                 st.success("✅ Classification complete!")
+                 display_ticket_card(ticket, classification)
+
+             except Exception as e:
+                 st.error(f"❌ Error classifying ticket: {e}")
+
+ def upload_classify_page(classifier):
+     """Upload and classify page"""
+     st.header("📂 Upload & Classify Tickets")
+
+     uploaded_file = st.file_uploader("Choose a JSON file", type="json")
+
+     if uploaded_file is not None:
+         try:
+             tickets_data = json.load(uploaded_file)
+             tickets = [Ticket(**ticket_data) for ticket_data in tickets_data]
+
+             st.info(f"📄 Loaded {len(tickets)} tickets from file")
+
+             if st.button("🚀 Classify All Tickets"):
+                 with st.spinner("🔄 Classifying tickets..."):
+                     try:
+                         classified_tickets = run_async(classify_tickets_async(classifier, tickets))
+
+                         st.success(f"✅ Successfully classified {len(classified_tickets)} tickets!")
+
+                         # Display statistics
+                         stats = calculate_stats(classified_tickets)
+                         col1, col2, col3, col4 = st.columns(4)
+                         with col1:
+                             st.metric("Total", stats['total'])
+                         with col2:
+                             st.metric("High Priority", stats['high_priority'])
+                         with col3:
+                             st.metric("Frustrated", stats['frustrated'])
+                         with col4:
+                             st.metric("RAG-Eligible", stats['rag_eligible'])
+
+                         # Display tickets
+                         for ticket, classification in classified_tickets:
+                             display_ticket_card(ticket, classification)
+
+                     except Exception as e:
+                         st.error(f"❌ Error during classification: {e}")
+
+         except Exception as e:
+             st.error(f"❌ Error loading file: {e}")
+
+ # Footer
+ def show_footer():
+     """Display footer"""
+     st.markdown("---")
+     st.markdown("""
+     <div style="text-align: center; color: #666; padding: 1rem;">
+         <p>🎯 <strong>Atlan Customer Support Copilot</strong> - AI-powered ticket classification and response generation</p>
+         <p>Built with Streamlit • Powered by Groq AI • Enhanced RAG Pipeline</p>
+     </div>
+     """, unsafe_allow_html=True)
+
+ # Run the app
+ if __name__ == "__main__":
+     main()
+     show_footer()
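
load_sample_tickets() above falls back to demo data when sample_tickets.json is missing. A minimal sketch for generating a compatible file (the field names match the Ticket usage above; the ticket text itself is made up):

    # Hypothetical seed script, not part of the commit.
    import json

    tickets = [
        {
            "id": "TICKET-001",
            "subject": "Snowflake crawler failing",
            "body": "Our scheduled Snowflake crawler has been timing out since yesterday."
        }
    ]
    # Write the schema the Streamlit app reads at startup.
    with open("sample_tickets.json", "w") as f:
        json.dump(tickets, f, indent=2)
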
Atlan/requirements.txt ADDED
@@ -0,0 +1,17 @@
+ streamlit>=1.28,<2
+ groq>=0.31
+ pydantic>=2.11,<3
+ python-dotenv>=1.1
+ httpx>=0.28
+ requests>=2.32
+ aiohttp>=3.12
+ beautifulsoup4>=4.13
+
+ # If you don't strictly need lxml, delete the next line to avoid native deps.
+ lxml==6.0.1
+
+ numpy==1.26.4
+ pandas==2.2.2
+ scikit-learn==1.5.2
+ sentence-transformers>=2.2
+ plotly>=5.17.0

DEPLOYMENT_GUIDE.md ADDED
File without changes
Dockerfile ADDED
@@ -0,0 +1,56 @@
+ # Dockerfile for Hugging Face Spaces - Streamlit App
+ FROM python:3.9-slim
+
+ # Install curl for the health check below
+ RUN apt-get update && apt-get install -y \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create user for security
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # Set environment variables
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1
+
+ # Set working directory
+ WORKDIR $HOME/app
+
+ # Copy requirements first for better Docker layer caching
+ COPY --chown=user:user requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir --user -r requirements.txt
+
+ # Copy the application files
+ COPY --chown=user:user . .
+
+ # Create necessary directories
+ RUN mkdir -p $HOME/.streamlit
+
+ # Create Streamlit config
+ RUN echo "\
+ [general]\n\
+ email = \"\"\n\
+ " > $HOME/.streamlit/credentials.toml
+
+ RUN echo "\
+ [server]\n\
+ headless = true\n\
+ enableCORS = false\n\
+ enableXsrfProtection = false\n\
+ port = 7860\n\
+ " > $HOME/.streamlit/config.toml
+
+ # Expose the port that Hugging Face Spaces expects
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
+
+ # Command to run the Streamlit app
+ CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
Dockerfile.fastapi ADDED
File without changes
README_HF.md ADDED
File without changes
app.py CHANGED
@@ -23,18 +23,25 @@ logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)
  
  try:
-     # Try Streamlit Cloud secrets first, then fall back to .env
+     # Try multiple sources for API key: Streamlit secrets, environment variables, .env file
      if hasattr(st, 'secrets') and 'GROQ_API_KEY' in st.secrets:
          os.environ['GROQ_API_KEY'] = st.secrets['GROQ_API_KEY']
          st.success("🔑 API key loaded from Streamlit Cloud secrets")
-     elif 'GROQ_API_KEY' not in os.environ:
+     elif 'GROQ_API_KEY' in os.environ:
+         st.success("🔑 API key loaded from environment variables")
+     elif hasattr(st, 'secrets') and hasattr(st.secrets, 'default') and 'GROQ_API_KEY' in st.secrets.default:
+         os.environ['GROQ_API_KEY'] = st.secrets.default['GROQ_API_KEY']
+         st.success("🔑 API key loaded from Streamlit secrets")
+     else:
          st.error("⚠️ GROQ_API_KEY not found!")
-         st.info("**For Streamlit Cloud deployment:**")
-         st.info("Add your API key in the Streamlit Cloud app settings > Secrets tab")
+         st.info("**For Hugging Face Spaces deployment:**")
+         st.info("Add your API key in the Space settings > Secrets tab")
          st.code("""
-         # In Streamlit Cloud Secrets:
+         # In Hugging Face Spaces Secrets:
          GROQ_API_KEY = "your_groq_api_key_here"
          """)
+         st.info("**For Streamlit Cloud deployment:**")
+         st.info("Add your API key in the Streamlit Cloud app settings > Secrets tab")
          st.info("**For local development:**")
          st.info("Add GROQ_API_KEY to your .env file")
          st.code("""
@@ -42,22 +49,11 @@ try:
          GROQ_API_KEY=your_groq_api_key_here
          """)
          st.stop()
-     else:
-         st.success("🔑 API key loaded from environment")
  except Exception as e:
      st.error(f"⚠️ Error accessing API key: {e}")
      st.error("Please check your configuration")
      st.stop()
  
- try:
-     from models import Ticket, TicketClassification, TopicTagEnum, SentimentEnum, PriorityEnum
-     from classifier import TicketClassifier
-     from enhanced_rag import EnhancedRAGPipeline
- except ImportError as e:
-     st.error(f"❌ Failed to import required modules: {e}")
-     st.error("Please ensure all required files are present")
-     st.stop()
-
  # Import application modules after environment setup
  try:
      from models import Ticket, TicketClassification, TopicTagEnum, SentimentEnum, PriorityEnum
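
The patch reorders key lookup to: top-level st.secrets, then the process environment, then the [default] table from the template above. A standalone sketch of that resolution order (illustrative only; the real code also emits st.success/st.error UI messages and calls st.stop() on failure):

    # Hypothetical helper mirroring the patched lookup order.
    import os
    import streamlit as st

    def resolve_groq_key():
        if hasattr(st, "secrets") and "GROQ_API_KEY" in st.secrets:
            return st.secrets["GROQ_API_KEY"]
        if "GROQ_API_KEY" in os.environ:
            return os.environ["GROQ_API_KEY"]
        if hasattr(st, "secrets") and hasattr(st.secrets, "default"):
            return st.secrets.default.get("GROQ_API_KEY")
        return None  # caller should surface setup instructions and stop
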
deploy_prep.bat ADDED
File without changes
deploy_prep.sh ADDED
File without changes
fastapi_app.py ADDED
File without changes
main.py DELETED
@@ -1,284 +0,0 @@
- import os
- import json
- import logging
- from typing import List, Dict
- from fastapi import FastAPI, HTTPException, Request, File, UploadFile, Form
- from fastapi.responses import HTMLResponse, JSONResponse
- from dotenv import load_dotenv
- import uvicorn
- import httpx
-
- from models import (
-     Ticket,
-     TicketClassification,
-     ClassifiedTicket,
-     SingleTicketRequest,
-     BulkTicketRequest,
-     ClassificationResponse
- )
- from classifier import TicketClassifier
-
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Load environment variables
- load_dotenv()
-
- # Initialize FastAPI app
- app = FastAPI(
-     title="Atlan Customer Support Copilot",
-     description="AI-powered ticket classification and response generation",
-     version="1.0.0"
- )
-
- # Initialize the classifier
- classifier = TicketClassifier()
-
- async def rag_pipeline(question: str, topic_tags: List[str]) -> Dict:
-     """Enhanced RAG pipeline with proper knowledge retrieval"""
-     try:
-         # Import the enhanced RAG system
-         from enhanced_rag import EnhancedRAGPipeline
-
-         # Initialize RAG pipeline with Groq client from classifier
-         rag = EnhancedRAGPipeline(groq_client=classifier.client)
-
-         # Generate answer using the enhanced pipeline
-         result = await rag.generate_answer(question, topic_tags)
-         return result
-
-     except ImportError as e:
-         logger.warning(f"Enhanced RAG system not available: {e}")
-         # Fallback to basic routing if enhanced RAG fails
-         return await fallback_rag_pipeline(question, topic_tags)
-
-     except Exception as e:
-         logger.error(f"RAG pipeline error: {e}")
-         # Fallback to basic routing if enhanced RAG fails
-         return await fallback_rag_pipeline(question, topic_tags)
-
- async def fallback_rag_pipeline(question: str, topic_tags: List[str]) -> Dict:
-     """Fallback RAG pipeline for when enhanced system is not available"""
-     if any(tag in ["How-to", "Product", "Best practices", "API/SDK", "SSO"] for tag in topic_tags):
-         # Basic knowledge responses
-         context = f"Based on Atlan documentation for topics: {', '.join(topic_tags)}"
-
-         return {
-             "type": "direct_answer",
-             "answer": f"Based on the documentation, here's information about: {question}. {context}",
-             "sources": ["https://docs.atlan.com/", "https://developer.atlan.com/"]
-         }
-     else:
-         return {
-             "type": "routing",
-             "message": f"This ticket has been classified as a '{topic_tags[0] if topic_tags else 'General'}' issue and routed to the appropriate team."
-         }
-
- @app.get("/")
- async def root():
-     """API root endpoint."""
-     return {
-         "message": "Atlan Customer Support Copilot API",
-         "version": "1.0.0",
-         "endpoints": [
-             "/health",
-             "/classify-single",
-             "/classify-bulk",
-             "/bulk-dashboard",
-             "/interactive-agent",
-             "/sample-tickets"
-         ]
-     }
-
- @app.post("/classify-single", response_model=ClassificationResponse)
- async def classify_single_ticket(request: SingleTicketRequest):
-     """Classify a single support ticket."""
-     try:
-         classification = await classifier.classify_ticket(request.ticket)
-         classified_ticket = ClassifiedTicket(
-             ticket=request.ticket,
-             classification=classification
-         )
-
-         return ClassificationResponse(
-             success=True,
-             data=[classified_ticket],
-             total_processed=1
-         )
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
-
- @app.post("/classify-bulk", response_model=ClassificationResponse)
- async def classify_bulk_tickets(request: BulkTicketRequest):
-     """Classify multiple support tickets."""
-     try:
-         if not request.tickets:
-             raise HTTPException(status_code=400, detail="No tickets provided")
-
-         classifications = await classifier.classify_tickets_bulk(request.tickets)
-
-         classified_tickets = [
-             ClassifiedTicket(ticket=ticket, classification=classification)
-             for ticket, classification in zip(request.tickets, classifications)
-         ]
-
-         return ClassificationResponse(
-             success=True,
-             data=classified_tickets,
-             total_processed=len(classified_tickets)
-         )
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Bulk classification failed: {str(e)}")
-
- @app.get("/sample-tickets", response_model=ClassificationResponse)
- async def classify_sample_tickets():
-     """Load and classify the sample tickets from the JSON file."""
-     try:
-         # Load sample tickets
-         sample_file_path = "sample_tickets.json"
-         if not os.path.exists(sample_file_path):
-             raise HTTPException(status_code=404, detail="Sample tickets file not found")
-
-         with open(sample_file_path, "r") as f:
-             tickets_data = json.load(f)
-
-         # Convert to Ticket objects
-         tickets = [Ticket(**ticket_data) for ticket_data in tickets_data]
-
-         # Classify all tickets
-         classifications = await classifier.classify_tickets_bulk(tickets)
-
-         classified_tickets = [
-             ClassifiedTicket(ticket=ticket, classification=classification)
-             for ticket, classification in zip(tickets, classifications)
-         ]
-
-         return ClassificationResponse(
-             success=True,
-             data=classified_tickets,
-             total_processed=len(classified_tickets)
-         )
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Failed to process sample tickets: {str(e)}")
-
- @app.get("/bulk-dashboard", response_model=ClassificationResponse)
- async def bulk_dashboard():
-     """Automatically load and classify all sample tickets for the bulk dashboard on page load."""
-     try:
-         # Load sample tickets
-         sample_file_path = "sample_tickets.json"
-         if not os.path.exists(sample_file_path):
-             logger.warning(f"Sample tickets file not found: {sample_file_path}")
-             return ClassificationResponse(
-                 success=True,
-                 data=[],
-                 total_processed=0
-             )
-
-         with open(sample_file_path, "r") as f:
-             tickets_data = json.load(f)
-
-         logger.info(f"Loaded {len(tickets_data)} sample tickets for bulk processing")
-
-         # Convert to Ticket objects
-         tickets = [Ticket(**ticket_data) for ticket_data in tickets_data]
-
-         # Classify all tickets
-         classifications = await classifier.classify_tickets_bulk(tickets)
-
-         classified_tickets = [
-             ClassifiedTicket(ticket=ticket, classification=classification)
-             for ticket, classification in zip(tickets, classifications)
-         ]
-
-         logger.info(f"Successfully classified {len(classified_tickets)} tickets for bulk dashboard")
-
-         return ClassificationResponse(
-             success=True,
-             data=classified_tickets,
-             total_processed=len(classified_tickets)
-         )
-
-     except Exception as e:
-         logger.error(f"Failed to process bulk dashboard: {str(e)}")
-         raise HTTPException(status_code=500, detail=f"Failed to process bulk dashboard: {str(e)}")
-
- @app.post("/upload-tickets", response_model=ClassificationResponse)
- async def upload_and_classify_tickets(file: UploadFile = File(...)):
-     """Upload a JSON file and classify the tickets."""
-     try:
-         if not file.filename.endswith('.json'):
-             raise HTTPException(status_code=400, detail="File must be a JSON file")
-
-         content = await file.read()
-         tickets_data = json.loads(content)
-
-         # Convert to Ticket objects
-         tickets = [Ticket(**ticket_data) for ticket_data in tickets_data]
-
-         # Classify all tickets
-         classifications = await classifier.classify_tickets_bulk(tickets)
-
-         classified_tickets = [
-             ClassifiedTicket(ticket=ticket, classification=classification)
-             for ticket, classification in zip(tickets, classifications)
-         ]
-
-         return ClassificationResponse(
-             success=True,
-             data=classified_tickets,
-             total_processed=len(classified_tickets)
-         )
-
-     except json.JSONDecodeError:
-         raise HTTPException(status_code=400, detail="Invalid JSON file")
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Failed to process uploaded tickets: {str(e)}")
-
- @app.post("/interactive-agent")
- async def interactive_agent(
-     question: str = Form(...),
-     channel: str = Form("web")
- ):
-     """Interactive endpoint for new ticket/question submission."""
-     # Create a dummy ticket
-     ticket = Ticket(id="INTERACTIVE-001", subject=question[:80], body=question)
-     classification = await classifier.classify_ticket(ticket)
-     topic_tags = [tag.value for tag in classification.topic_tags]
-     # Internal analysis view
-     analysis = {
-         "topic_tags": topic_tags,
-         "sentiment": classification.sentiment.value,
-         "priority": classification.priority.value,
-         "reasoning": classification.reasoning
-     }
-     # Final response view
-     rag_topics = ["How-to", "Product", "Best practices", "API/SDK", "SSO"]
-     if any(tag in rag_topics for tag in topic_tags):
-         rag_result = await rag_pipeline(question, topic_tags)
-         final_response = {
-             "type": "direct_answer",
-             "answer": rag_result.get("answer", "No answer found."),
-             "sources": rag_result.get("sources", [])
-         }
-     else:
-         final_response = {
-             "type": "routing",
-             "message": f"This ticket has been classified as a '{topic_tags[0]}' issue and routed to the appropriate team."
-         }
-     return JSONResponse({
-         "internal_analysis": analysis,
-         "final_response": final_response
-     })
-
- @app.get("/health")
- async def health_check():
-     """Health check endpoint."""
-     return {"status": "healthy", "service": "Atlan Customer Support Copilot"}
-
- if __name__ == "__main__":
-     uvicorn.run(app, host="127.0.0.1", port=8000)
requirements.txt CHANGED
@@ -1,4 +1,6 @@
  streamlit>=1.28,<2
+ fastapi>=0.104.0
+ uvicorn[standard]>=0.24.0
  groq>=0.31
  pydantic>=2.11,<3
  python-dotenv>=1.1
scraper.py DELETED
@@ -1,291 +0,0 @@
- #!/usr/bin/env python3
-
- import asyncio
- import aiohttp
- import json
- import re
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin, urlparse
- from pathlib import Path
- import time
- from typing import List, Dict, Set
- import logging
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- class AtlanDocScraper:
-     def __init__(self):
-         self.session = None
-         self.scraped_urls = set()
-         self.knowledge_base = []
-         self.base_urls = {
-             "docs": "https://docs.atlan.com/",
-             "developer": "https://developer.atlan.com/"
-         }
-         self.max_pages_per_site = 50
-         self.delay_between_requests = 1
-
-     async def create_session(self):
-         """Create an aiohttp session with proper headers"""
-         headers = {
-             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-             'Accept-Language': 'en-US,en;q=0.5',
-             'Accept-Encoding': 'gzip, deflate',
-             'Connection': 'keep-alive'
-         }
-         timeout = aiohttp.ClientTimeout(total=30)
-         self.session = aiohttp.ClientSession(headers=headers, timeout=timeout)
-
-     async def close_session(self):
-         """Close the aiohttp session"""
-         if self.session:
-             await self.session.close()
-
-     def clean_text(self, text: str) -> str:
-         """Clean and normalize text content"""
-         if not text:
-             return ""
-
-         # Remove extra whitespace and normalize
-         text = re.sub(r'\s+', ' ', text.strip())
-
-         # Remove common navigation elements
-         text = re.sub(r'(Home|Navigation|Menu|Footer|Header|Sidebar)', '', text, flags=re.IGNORECASE)
-
-         # Remove very short content
-         if len(text) < 50:
-             return ""
-
-         return text
-
-     def extract_main_content(self, soup: BeautifulSoup) -> str:
-         """Extract main content from HTML, focusing on documentation"""
-
-         # Try to find main content areas
-         content_selectors = [
-             'main',
-             'article',
-             '.content',
-             '.main-content',
-             '.documentation',
-             '.docs-content',
-             '#content',
-             '.markdown-body',
-             '.prose'
-         ]
-
-         main_content = ""
-
-         for selector in content_selectors:
-             content_elem = soup.select_one(selector)
-             if content_elem:
-                 main_content = content_elem.get_text(separator=' ', strip=True)
-                 break
-
-         # Fallback: get all text but filter out navigation
-         if not main_content:
-             # Remove navigation, footer, header elements
-             for tag in soup.find_all(['nav', 'footer', 'header', 'aside']):
-                 tag.decompose()
-
-             main_content = soup.get_text(separator=' ', strip=True)
-
-         return self.clean_text(main_content)
-
-     def extract_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
-         """Extract relevant internal links from the page"""
-         links = []
-
-         for link in soup.find_all('a', href=True):
-             href = link['href']
-             full_url = urljoin(base_url, href)
-
-             # Only include links from the same domain
-             if urlparse(full_url).netloc in [urlparse(url).netloc for url in self.base_urls.values()]:
-                 # Filter out non-documentation links
-                 if not any(skip in full_url.lower() for skip in ['#', 'mailto:', 'tel:', 'javascript:']):
-                     links.append(full_url)
-
-         return list(set(links))  # Remove duplicates
-
-     async def scrape_page(self, url: str) -> Dict:
-         """Scrape a single page and extract content"""
-         if url in self.scraped_urls:
-             return None
-
-         try:
-             logger.info(f"Scraping: {url}")
-
-             async with self.session.get(url) as response:
-                 if response.status != 200:
-                     logger.warning(f"Failed to fetch {url}: {response.status}")
-                     return None
-
-                 html = await response.text()
-                 soup = BeautifulSoup(html, 'html.parser')
-
-                 # Extract metadata
-                 title = soup.find('title')
-                 title_text = title.get_text().strip() if title else ""
-
-                 # Extract main content
-                 content = self.extract_main_content(soup)
-
-                 if not content:
-                     logger.warning(f"No content extracted from {url}")
-                     return None
-
-                 # Extract links for further crawling
-                 links = self.extract_links(soup, url)
-
-                 self.scraped_urls.add(url)
-
-                 return {
-                     'url': url,
-                     'title': title_text,
-                     'content': content,
-                     'links': links,
-                     'timestamp': time.time(),
-                     'source': 'docs' if 'docs.atlan.com' in url else 'developer'
-                 }
-
-         except Exception as e:
-             logger.error(f"Error scraping {url}: {str(e)}")
-             return None
-
-     async def crawl_site(self, base_url: str, max_pages: int = 50) -> List[Dict]:
-         """Crawl a site starting from base URL"""
-         pages_data = []
-         urls_to_visit = [base_url]
-         visited = set()
-
-         while urls_to_visit and len(pages_data) < max_pages:
-             current_url = urls_to_visit.pop(0)
-
-             if current_url in visited:
-                 continue
-
-             visited.add(current_url)
-
-             # Scrape the page
-             page_data = await self.scrape_page(current_url)
-
-             if page_data:
-                 pages_data.append(page_data)
-
-                 # Add new links to visit (limit to avoid infinite crawling)
-                 new_links = [link for link in page_data['links']
-                              if link not in visited and link not in urls_to_visit]
-                 urls_to_visit.extend(new_links[:10])  # Limit new links per page
-
-             # Be respectful - add delay between requests
-             await asyncio.sleep(self.delay_between_requests)
-
-         return pages_data
-
-     async def scrape_all_sites(self) -> List[Dict]:
-         """Scrape all configured sites"""
-         await self.create_session()
-
-         try:
-             all_pages = []
-
-             for site_name, base_url in self.base_urls.items():
-                 logger.info(f"Starting to crawl {site_name}: {base_url}")
-                 site_pages = await self.crawl_site(base_url, self.max_pages_per_site)
-                 all_pages.extend(site_pages)
-                 logger.info(f"Scraped {len(site_pages)} pages from {site_name}")
-
-                 # Delay between sites
-                 await asyncio.sleep(2)
-
-             self.knowledge_base = all_pages
-             return all_pages
-
-         finally:
-             await self.close_session()
-
-     def save_knowledge_base(self, filename: str = "atlan_knowledge_base.json"):
-         """Save the scraped knowledge base to a JSON file"""
-         output_path = Path(filename)
-
-         with open(output_path, 'w', encoding='utf-8') as f:
-             json.dump(self.knowledge_base, f, indent=2, ensure_ascii=False)
-
-         logger.info(f"Knowledge base saved to {output_path}")
-         logger.info(f"Total pages: {len(self.knowledge_base)}")
-
-         # Print summary statistics
-         source_counts = {}
-         for page in self.knowledge_base:
-             source = page.get('source', 'unknown')
-             source_counts[source] = source_counts.get(source, 0) + 1
-
-         logger.info(f"Pages by source: {source_counts}")
-
-     def load_knowledge_base(self, filename: str = "atlan_knowledge_base.json") -> List[Dict]:
-         """Load existing knowledge base from file"""
-         try:
-             with open(filename, 'r', encoding='utf-8') as f:
-                 self.knowledge_base = json.load(f)
-             logger.info(f"Loaded {len(self.knowledge_base)} pages from {filename}")
-             return self.knowledge_base
-         except FileNotFoundError:
-             logger.warning(f"Knowledge base file {filename} not found")
-             return []
-         except Exception as e:
-             logger.error(f"Error loading knowledge base: {str(e)}")
-             return []
-
- async def main():
-     """Main function to run the scraper"""
-     scraper = AtlanDocScraper()
-
-     print("🕷️ Starting Atlan Documentation Scraper...")
-     print("=" * 50)
-
-     # Check if knowledge base already exists
-     existing_kb = scraper.load_knowledge_base()
-
-     if existing_kb:
-         print(f"📚 Found existing knowledge base with {len(existing_kb)} pages")
-         response = input("Do you want to re-scrape? (y/N): ").strip().lower()
-         if response != 'y':
-             print("✅ Using existing knowledge base")
-             return
-
-     print("🚀 Starting web scraping...")
-     print("⏱️ This may take several minutes...")
-
-     start_time = time.time()
-
-     try:
-         pages = await scraper.scrape_all_sites()
-         scraper.save_knowledge_base()
-
-         end_time = time.time()
-         duration = end_time - start_time
-
-         print(f"\n✅ Scraping completed!")
-         print(f"📊 Statistics:")
-         print(f"   - Total pages scraped: {len(pages)}")
-         print(f"   - Time taken: {duration:.2f} seconds")
-         print(f"   - Average time per page: {duration/len(pages):.2f} seconds")
-
-         # Show sample of scraped content
-         if pages:
-             print(f"\n📄 Sample page:")
-             sample = pages[0]
-             print(f"   - Title: {sample['title'][:100]}...")
-             print(f"   - URL: {sample['url']}")
-             print(f"   - Content length: {len(sample['content'])} characters")
-
-     except KeyboardInterrupt:
-         print("\n⚠️ Scraping interrupted by user")
-     except Exception as e:
-         print(f"\n❌ Error during scraping: {str(e)}")
-
- if __name__ == "__main__":
-     asyncio.run(main())