raksa-the-wildcats committed on
Commit
f8c0dab
·
1 Parent(s): d8024c0

first commit

Browse files
README.md CHANGED
@@ -1,12 +1,29 @@
1
  ---
2
- title: DeepSeek Test
3
- emoji: 🐠
4
- colorFrom: blue
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.34.2
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Chatbot
3
+ emoji: 🔥
4
+ colorFrom: green
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.34.2
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # Web Accessibility Chatbot
13
+
14
+ An AI-powered learning assistant for university students studying web accessibility, built with WebAIM resources and DeepSeek-R1.
15
+
16
+ ## Features
17
+ - Answers based on authoritative WebAIM documentation
18
+ - Proper source citations
19
+ - Student-friendly explanations
20
+ - Code examples and best practices
21
+ - Assignment guidance
22
+
23
+ ## Setup
24
+ 1. Upload your WebAIM PDFs to the `pdfs/` directory
25
+ 2. Run the PDF processor to create the knowledge base
26
+ 3. Set your Hugging Face token in the environment variables
27
+ 4. Deploy to Hugging Face Spaces
28
+
29
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from huggingface_hub import InferenceClient
4
+ from utils.retriever import KnowledgeRetriever
5
+ import json
6
+
7
class AccessibilityChatbot:
    """Retrieval-augmented chatbot that answers accessibility questions.

    Each question is matched against the WebAIM knowledge base through
    ``KnowledgeRetriever``; the retrieved passages are placed in the system
    message sent to the DeepSeek-R1 chat-completion endpoint.
    """

    def __init__(self):
        """Create the inference client, the retriever, and the system prompt."""
        # Initialize DeepSeek-R1 client; the token is read from the HF_TOKEN
        # environment variable (None when unset).
        self.client = InferenceClient(
            model="deepseek-ai/DeepSeek-R1",
            token=os.getenv("HF_TOKEN")
        )

        # Initialize knowledge retriever
        self.retriever = KnowledgeRetriever()

        # System prompt for accessibility education
        self.system_prompt = (
            "You are an expert web accessibility instructor helping university students learn about web accessibility.\n"
            "\n"
            "Your knowledge comes from WebAIM resources, which are authoritative sources for web accessibility information.\n"
            "\n"
            "Guidelines for responses:\n"
            "1. Provide clear, student-friendly explanations\n"
            "2. Use the provided WebAIM context to answer questions accurately\n"
            "3. Always cite your sources by mentioning the WebAIM document and page number\n"
            "4. Include practical examples and code snippets when relevant\n"
            "5. Break down complex concepts into digestible parts\n"
            "6. Encourage best practices and standards compliance\n"
            "7. If asked about assignments, provide actionable guidance\n"
            "\n"
            "Remember: You're teaching students, so be encouraging and educational while maintaining accuracy."
        )

    def generate_response(self, message, history):
        """Generate response using DeepSeek-R1 with WebAIM context.

        Args:
            message: The user's current question.
            history: Previous turns as ``(user_text, assistant_text)`` pairs;
                may be empty or ``None``.

        Returns:
            The assistant's reply, followed by a "Sources" list when relevant
            passages were retrieved. On any failure of the completion call an
            apology string containing the error text is returned instead of
            raising.
        """
        # Retrieve relevant content from WebAIM PDFs
        relevant_content = self.retriever.retrieve_relevant_content(message)
        context = self.retriever.format_context_for_llm(relevant_content)

        # Prepare messages for the LLM
        messages = [
            {"role": "system", "content": f"{self.system_prompt}\n\nContext from WebAIM resources:\n{context}"}
        ]

        # Add conversation history. A turn with a missing side (None or empty)
        # is dropped as a whole: sending ``content: None`` is invalid, and
        # keeping only one half would break user/assistant alternation.
        for human, assistant in history or []:
            if not human or not assistant:
                continue
            messages.append({"role": "user", "content": human})
            messages.append({"role": "assistant", "content": assistant})

        # Add current message
        messages.append({"role": "user", "content": message})

        try:
            response = self.client.chat_completion(
                messages=messages,
                max_tokens=1500,
                temperature=0.7,
                top_p=0.9
            )

            assistant_response = response.choices[0].message.content

            # Add source information
            if relevant_content and assistant_response:
                sources = self.format_sources(relevant_content)
                assistant_response += f"\n\n**Sources:**\n{sources}"

            return assistant_response or "I apologize, but I couldn't generate a response. Please try again."

        except Exception as e:
            # UI boundary: report the failure in the chat instead of raising
            # into the Gradio callback.
            return f"I apologize, but I'm experiencing technical difficulties. Please try again. Error: {str(e)}"

    def format_sources(self, content_list):
        """Format source citations for display.

        Args:
            content_list: Retrieved items, each a dict with ``source_file``
                and ``page_number`` keys.

        Returns:
            One bullet line per distinct (file, page) pair, in first-seen
            order, joined with newlines; an empty string for no items.
        """
        sources = []
        seen_sources = set()

        for item in content_list:
            # A tuple key cannot collide the way a "file_page" string could.
            source_key = (item['source_file'], item['page_number'])
            if source_key not in seen_sources:
                sources.append(f"• {item['source_file']} (Page {item['page_number']})")
                seen_sources.add(source_key)

        return "\n".join(sources)
86
+
87
# Single module-level chatbot instance; the Gradio callbacks defined in this
# file call its generate_response method.
chatbot = AccessibilityChatbot()
89
+
90
+ # Create Gradio interface
91
# Custom CSS for improved styling (passed to gr.Blocks).
_CUSTOM_CSS = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}

.main-header {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
    text-align: center;
    box-shadow: 0 8px 32px rgba(0,0,0,0.1);
}

.main-header h1 {
    margin: 0;
    font-size: 2.5rem;
    font-weight: 700;
    text-shadow: 0 2px 4px rgba(0,0,0,0.3);
}

.main-header p {
    margin: 1rem 0 0 0;
    font-size: 1.1rem;
    opacity: 0.9;
}

.feature-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
    gap: 1rem;
    margin: 2rem 0;
}

.feature-card {
    background: white;
    padding: 1.5rem;
    border-radius: 12px;
    border: 1px solid #e1e5e9;
    box-shadow: 0 4px 6px rgba(0,0,0,0.05);
    transition: transform 0.2s, box-shadow 0.2s;
}

.feature-card:hover {
    transform: translateY(-2px);
    box-shadow: 0 8px 25px rgba(0,0,0,0.1);
}

.feature-card h3 {
    color: #667eea;
    margin: 0 0 0.5rem 0;
    font-size: 1.2rem;
}

.chat-container {
    background: white;
    border-radius: 15px;
    padding: 2rem;
    box-shadow: 0 8px 32px rgba(0,0,0,0.1);
    border: 1px solid #e1e5e9;
}

.input-container {
    background: #f8f9fa;
    border-radius: 12px;
    padding: 1.5rem;
    margin-top: 1rem;
}

.examples-section {
    background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
    color: white;
    padding: 2rem;
    border-radius: 15px;
    margin: 2rem 0;
}

.examples-section h3 {
    margin: 0 0 1rem 0;
    font-size: 1.5rem;
}

.resources-section {
    background: #f8f9fa;
    border-radius: 15px;
    padding: 2rem;
    margin: 2rem 0;
    border: 1px solid #e1e5e9;
}

.footer {
    text-align: center;
    padding: 2rem;
    color: #6c757d;
    border-top: 1px solid #e1e5e9;
    margin-top: 2rem;
}

.gradio-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    border-radius: 8px !important;
    color: white !important;
    font-weight: 600 !important;
    padding: 12px 24px !important;
    transition: all 0.3s ease !important;
}

.gradio-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 25px rgba(102, 126, 234, 0.4) !important;
}

.gradio-textbox {
    border-radius: 12px !important;
    border: 2px solid #e1e5e9 !important;
    transition: border-color 0.3s ease !important;
}

.gradio-textbox:focus-within {
    border-color: #667eea !important;
    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
}

.chatbot-container {
    border-radius: 12px !important;
    border: 1px solid #e1e5e9 !important;
    background: white !important;
}
"""

# Questions offered by the "Example Questions" widget.
_EXAMPLE_QUESTIONS = [
    "What are the WCAG 2.1 AA requirements for color contrast?",
    "How do I make forms accessible to screen readers?",
    "What's the difference between aria-label and aria-labelledby?",
    "How can I test my website with a screen reader?",
    "What are the most common accessibility mistakes students make?",
    "How do I write effective alt text for complex images?",
    "What ARIA roles should I use for a navigation menu?",
    "How do I make data tables accessible?",
    "What are the keyboard navigation requirements?",
    "How do I ensure my site works without JavaScript?",
]

# Markdown bodies are kept flush-left so their rendering does not depend on
# how the enclosing code is indented.
_TOOLS_MARKDOWN = """
### Essential Accessibility Testing Tools:

**🔍 Automated Testing:**
- **WAVE**: Web accessibility evaluation tool (wave.webaim.org)
- **axe DevTools**: Browser extension for accessibility testing
- **Lighthouse**: Built-in accessibility audit in Chrome DevTools
- **HTML_CodeSniffer**: Bookmarklet for quick accessibility checks

**🎧 Screen Readers:**
- **NVDA**: Free screen reader for Windows
- **JAWS**: Professional screen reader (paid)
- **VoiceOver**: Built-in screen reader for macOS
- **TalkBack**: Android screen reader

**🎨 Color & Contrast:**
- **WebAIM Contrast Checker**: Verify color contrast ratios
- **Color Oracle**: Simulate color blindness
- **Stark**: Design tool with accessibility features
"""

_STANDARDS_MARKDOWN = """
### Web Accessibility Standards:

**🌐 WCAG 2.1:**
- **Level A**: Basic accessibility requirements
- **Level AA**: Standard compliance (most common target)
- **Level AAA**: Highest level of accessibility

**🇺🇸 US Standards:**
- **Section 508**: Federal accessibility requirements
- **ADA**: Americans with Disabilities Act considerations
- **CVAA**: 21st Century Communications and Video Accessibility Act

**🌍 International:**
- **EN 301 549**: European accessibility standard
- **ISO 9241-171**: International ergonomics standard
"""


def create_interface():
    """Build the Gradio Blocks UI for the accessibility chatbot.

    The page consists of a header, feature cards, the chat window, the
    question input with its buttons, example questions, two reference
    accordions and a footer. Submitting a question (Enter or the button)
    calls the module-level ``chatbot`` and appends the turn to the chat.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(
        title="Web Accessibility Learning Assistant",
        css=_CUSTOM_CSS
    ) as demo:

        # Header
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="main-header">
                    <h1>🌐 Web Accessibility Learning Assistant</h1>
                    <p>Your personal tutor for mastering web accessibility using authoritative WebAIM resources</p>
                </div>
                """)

        # Feature highlights
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="feature-grid">
                    <div class="feature-card">
                        <h3>📋 WCAG Guidelines</h3>
                        <p>Master success criteria and implementation strategies with expert guidance</p>
                    </div>
                    <div class="feature-card">
                        <h3>🔍 Screen Reader Testing</h3>
                        <p>Learn how to test with assistive technologies like NVDA and JAWS</p>
                    </div>
                    <div class="feature-card">
                        <h3>💻 Code Examples</h3>
                        <p>Get practical HTML, CSS, and JavaScript patterns for accessibility</p>
                    </div>
                    <div class="feature-card">
                        <h3>🎯 Best Practices</h3>
                        <p>Discover real-world accessibility solutions and common pitfalls</p>
                    </div>
                </div>
                """)

        # Main chat interface. The styling class is attached to the column
        # itself: a gr.HTML('<div>') / gr.HTML('</div>') pair renders as two
        # independent components and cannot wrap the widgets between them.
        with gr.Row():
            with gr.Column(scale=1, elem_classes=["chat-container"]):
                chatbot_interface = gr.Chatbot(
                    height=600,
                    placeholder="👋 Ask me anything about web accessibility! I'm here to help you learn.",
                    show_label=False,
                    container=True,
                    bubble_full_width=False,
                    elem_classes=["chatbot-container"]
                )

        # Input section (same wrapping approach as the chat container)
        with gr.Row():
            with gr.Column(scale=1, elem_classes=["input-container"]):
                msg = gr.Textbox(
                    placeholder="Type your question here... (e.g., 'How do I write good alt text?' or 'What are the WCAG contrast requirements?')",
                    label="Your Question",
                    lines=3,
                    max_lines=6,
                    elem_classes=["gradio-textbox"]
                )

                with gr.Row():
                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary", size="sm")
                    submit_btn = gr.Button("🚀 Ask Question", variant="primary", size="lg")

        # Quick start examples
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="examples-section">
                    <h3>🚀 Quick Start Examples</h3>
                    <p>Click any example below to get started with common accessibility questions:</p>
                </div>
                """)

                gr.Examples(
                    examples=_EXAMPLE_QUESTIONS,
                    inputs=msg,
                    examples_per_page=5,
                    label="Example Questions"
                )

        # Additional resources
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="resources-section">
                    <h3>📚 Additional Learning Resources</h3>
                </div>
                """)

                with gr.Accordion("🛠️ Recommended Tools", open=False):
                    gr.Markdown(_TOOLS_MARKDOWN)

                with gr.Accordion("📋 Key Standards & Guidelines", open=False):
                    gr.Markdown(_STANDARDS_MARKDOWN)

        # Footer
        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("""
                <div class="footer">
                    <p><strong>This chatbot uses authoritative WebAIM resources and is powered by DeepSeek-R1.</strong></p>
                    <p>For the most up-to-date information, always refer to the original WebAIM documentation at <a href="https://webaim.org" target="_blank" rel="noopener noreferrer">webaim.org</a></p>
                </div>
                """)

        # Handle message submission
        def respond(message, history):
            """Answer ``message`` and return (updated history, cleared textbox)."""
            # Work on a copy so the component's value is never mutated in
            # place; also tolerates a history of None.
            turns = list(history or [])
            if not message or not message.strip():
                return turns, ""

            response = chatbot.generate_response(message, turns)
            turns.append((message, response))
            return turns, ""

        def clear_chat():
            """Reset both the chat window and the input box."""
            return [], ""

        # Event handlers
        msg.submit(respond, [msg, chatbot_interface], [chatbot_interface, msg])
        submit_btn.click(respond, [msg, chatbot_interface], [chatbot_interface, msg])
        clear_btn.click(clear_chat, outputs=[chatbot_interface, msg])

    return demo
406
+
407
+ # Launch the app
408
if __name__ == "__main__":
    # Collect the server settings in one place, then build and start the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)
knowledge_base.json ADDED
The diff for this file is too large to render. See raw diff
 
pdf_processor.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import json
3
+ import os
4
+ import re
5
+ from sentence_transformers import SentenceTransformer
6
+ import pickle
7
+
8
class PDFProcessor:
    """Turn a directory of WebAIM PDFs into an embedded knowledge base.

    Pipeline: extract the text of each page, clean it, split it into
    overlapping word chunks, embed every chunk with a sentence-transformers
    model, and write chunks + embeddings to a JSON file.
    """

    # Single source of truth for values that are both used during processing
    # and recorded in the knowledge-base metadata.
    EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
    CHUNK_SIZE = 500      # words per chunk
    CHUNK_OVERLAP = 50    # words shared by consecutive chunks

    # NOTE(review): the default below is a machine-specific absolute path,
    # while the README refers to a `pdfs/` directory. It is kept unchanged for
    # backward compatibility; callers on other machines should pass
    # `pdf_directory` explicitly.
    def __init__(self, pdf_directory="/Users/maraksa/Downloads/chatbot/WebAIM/"):
        """Load the embedding model and make sure ``pdf_directory`` exists.

        Args:
            pdf_directory: Directory that holds the source PDF files. It is
                created (and a hint is printed) when it does not exist.
        """
        self.pdf_directory = pdf_directory
        self.embedder = SentenceTransformer(self.EMBEDDING_MODEL)

        # Check if directory exists
        if not os.path.exists(pdf_directory):
            os.makedirs(pdf_directory)
            print(f"Created directory: {pdf_directory}")
            print("Please add your WebAIM PDF files to this directory.")

    def clean_text(self, text):
        """Clean extracted text from PDF.

        Collapses every whitespace run to one space, removes
        "Page X of Y" markers, and strips the result.

        Args:
            text: Raw text of one page.

        Returns:
            The cleaned, single-line text.
        """
        # Remove extra whitespace and line breaks
        text = re.sub(r'\s+', ' ', text)

        # Remove common PDF artifacts
        text = re.sub(r'Page \d+ of \d+', '', text)
        # NOTE: an earlier `re.sub(r'WebAIM.*?\n', '', text)` lived here. It
        # could never match because all newlines are already collapsed above,
        # so it was dropped. If header stripping is wanted, it has to run
        # before whitespace normalization and be anchored tightly enough not
        # to delete body text that mentions WebAIM.

        # Removing a marker can leave two adjacent spaces behind.
        text = re.sub(r' {2,}', ' ', text)

        return text.strip()

    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF with page information.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A list of chunk dicts with the keys ``text``, ``source_file``,
            ``page_number`` (1-based), ``chunk_id`` (index of the chunk within
            its page) and ``source_type``.
        """
        print(f"Processing: {os.path.basename(pdf_path)}")
        doc = fitz.open(pdf_path)
        pages_content = []

        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()

                # Clean the text
                cleaned_text = self.clean_text(text)

                # Skip pages with very little content
                if len(cleaned_text) < 50:
                    continue

                # Chunk the cleaned text
                chunks = self.chunk_text(
                    cleaned_text,
                    chunk_size=self.CHUNK_SIZE,
                    overlap=self.CHUNK_OVERLAP,
                )

                for chunk_idx, chunk in enumerate(chunks):
                    if len(chunk.strip()) > 30:  # Only keep substantial chunks
                        pages_content.append({
                            'text': chunk,
                            'source_file': os.path.basename(pdf_path),
                            'page_number': page_num + 1,
                            'chunk_id': chunk_idx,
                            'source_type': 'WebAIM'
                        })
        finally:
            # Release the file handle even when a page fails to parse.
            doc.close()

        print(f"✅ Extracted {len(pages_content)} chunks from {os.path.basename(pdf_path)}")
        return pages_content

    def chunk_text(self, text, chunk_size=500, overlap=50):
        """Split text into overlapping chunks.

        Args:
            text: Text to split; words are separated on whitespace.
            chunk_size: Maximum number of words per chunk (must be > 0).
            overlap: Number of words repeated at the start of the next chunk
                (must satisfy ``0 <= overlap < chunk_size``).

        Returns:
            A list of chunk strings; empty when ``text`` has no words.

        Raises:
            ValueError: If ``chunk_size`` or ``overlap`` is out of range.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        words = text.split()
        step = chunk_size - overlap
        chunks = []

        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # This window already reaches the last word. Any further window
            # would only repeat part of the overlap region, so stop here.
            if start + chunk_size >= len(words):
                break

        return chunks

    def process_all_pdfs(self):
        """Process all PDFs in the directory.

        Files are handled in sorted name order so repeated runs produce the
        same chunk order. A file that fails to process is reported and
        skipped.

        Returns:
            The concatenated chunk dicts of every PDF, or an empty list when
            the directory contains no PDF files.
        """
        all_content = []

        # Check if PDFs exist (extension match is case-insensitive)
        pdf_files = sorted(
            f for f in os.listdir(self.pdf_directory) if f.lower().endswith('.pdf')
        )

        if not pdf_files:
            print(f"❌ No PDF files found in {self.pdf_directory}")
            print("Please add your WebAIM PDF files to that directory")
            return []

        print(f"Found {len(pdf_files)} PDF files:")
        for pdf_file in pdf_files:
            print(f" - {pdf_file}")

        for filename in pdf_files:
            pdf_path = os.path.join(self.pdf_directory, filename)
            try:
                content = self.extract_text_from_pdf(pdf_path)
                all_content.extend(content)
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the run.
                print(f"❌ Error processing {filename}: {str(e)}")

        return all_content

    def create_knowledge_base(self, output_path="knowledge_base.json"):
        """Create searchable knowledge base from PDFs.

        Args:
            output_path: Destination of the JSON knowledge base.

        Returns:
            The knowledge-base dict (``content``, ``embeddings``,
            ``metadata``) that was written, or ``None`` when no content could
            be extracted.
        """
        print("🚀 Starting PDF processing...")
        all_content = self.process_all_pdfs()

        if not all_content:
            print("❌ No content extracted. Please check your PDF files.")
            return None

        print(f"📄 Total chunks extracted: {len(all_content)}")
        print("🧠 Creating embeddings... (this may take a few minutes)")

        texts = [item['text'] for item in all_content]
        embeddings = self.embedder.encode(texts, show_progress_bar=True)

        # Save knowledge base
        knowledge_base = {
            'content': all_content,
            'embeddings': embeddings.tolist(),
            'metadata': {
                'total_chunks': len(all_content),
                'embedding_model': self.EMBEDDING_MODEL,
                'chunk_size': self.CHUNK_SIZE,
                'overlap': self.CHUNK_OVERLAP
            }
        }

        # Compact output: pretty-printing would put every embedding float on
        # its own line and inflate the file. Explicit UTF-8 keeps the output
        # independent of the platform's default encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(knowledge_base, f)

        print(f"✅ Knowledge base saved to {output_path}")
        print("📊 Summary:")
        print(f" - Total chunks: {len(all_content)}")
        print(f" - Embedding dimensions: {len(embeddings[0])}")
        print(f" - File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")

        return knowledge_base
139
+
140
+ # Usage
141
if __name__ == "__main__":
    # Script entry point: build the knowledge base with the default settings.
    processor = PDFProcessor()
    knowledge_base = processor.create_knowledge_base()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ huggingface_hub>=0.20.0
3
+ sentence-transformers>=2.2.0
4
+ scikit-learn>=1.3.0
5
+ numpy>=1.24.0
6
+ PyMuPDF>=1.23.0
7
+ python-dotenv>=1.0.0
utils/__init__.py ADDED
File without changes
utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (147 Bytes). View file
 
utils/__pycache__/retriever.cpython-312.pyc ADDED
Binary file (2.74 kB). View file
 
utils/retriever.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
class KnowledgeRetriever:
    """Semantic search over the pre-computed WebAIM knowledge base.

    The knowledge base is a JSON file with a ``content`` list of chunk dicts
    and a parallel ``embeddings`` list; queries are embedded with the same
    sentence-transformers model name and ranked by cosine similarity.
    """

    def __init__(self, knowledge_base_path="knowledge_base.json"):
        """Load the embedding model and the knowledge base.

        Args:
            knowledge_base_path: Path of the JSON knowledge base file.
        """
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')

        # Load knowledge base (explicit UTF-8 so loading does not depend on
        # the platform's default encoding)
        with open(knowledge_base_path, 'r', encoding='utf-8') as f:
            self.kb = json.load(f)

        self.content = self.kb['content']
        self.embeddings = np.array(self.kb['embeddings'])

    def retrieve_relevant_content(self, query, top_k=5, min_similarity=0.3):
        """Retrieve most relevant content for the query.

        Args:
            query: The user's question.
            top_k: Maximum number of chunks to return; values <= 0 yield an
                empty result.
            min_similarity: Minimum cosine similarity a chunk needs to be
                included.

        Returns:
            Up to ``top_k`` copies of content dicts, best match first, each
            extended with a float ``similarity_score``.
        """
        # Guard first: with top_k == 0 the slice `[-0:]` would select every
        # chunk, and an empty knowledge base cannot be scored at all.
        if top_k <= 0 or len(self.content) == 0:
            return []

        # Encode query
        query_embedding = self.embedder.encode([query])

        # Calculate similarities
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Get top results above threshold (descending similarity)
        top_indices = np.argsort(similarities)[::-1][:top_k]

        relevant_content = []
        for idx in top_indices:
            if similarities[idx] >= min_similarity:
                content_item = self.content[idx].copy()
                content_item['similarity_score'] = float(similarities[idx])
                relevant_content.append(content_item)

        return relevant_content

    def format_context_for_llm(self, relevant_content):
        """Format retrieved content for LLM context.

        Args:
            relevant_content: Items with ``source_file``, ``page_number`` and
                ``text`` keys, as returned by ``retrieve_relevant_content``.

        Returns:
            A text block listing every item under a numbered source header,
            or a fixed "nothing found" sentence for an empty list.
        """
        if not relevant_content:
            return "No relevant information found in WebAIM resources."

        parts = ["Relevant information from WebAIM resources:\n\n"]

        for i, item in enumerate(relevant_content, 1):
            parts.append(f"[Source {i}] From {item['source_file']} (Page {item['page_number']}):\n")
            parts.append(f"{item['text']}\n\n")

        return "".join(parts)