File size: 18,520 Bytes
9743f5c
 
 
 
729df82
7b8b9f8
1fe9127
53fbd70
7b8b9f8
 
 
9743f5c
d544279
9743f5c
7b8b9f8
75f5073
 
7b8b9f8
 
75f5073
7b8b9f8
 
 
 
75f5073
 
 
 
d544279
75f5073
7b8b9f8
 
 
 
75f5073
d544279
 
75f5073
7b8b9f8
75f5073
729df82
7b8b9f8
 
 
 
75f5073
9743f5c
 
75f5073
7b8b9f8
 
 
75f5073
04bf77d
7b8b9f8
1fe9127
7b8b9f8
75f5073
 
7b8b9f8
 
75f5073
 
 
7b8b9f8
75f5073
 
 
d544279
7b8b9f8
 
75f5073
7b8b9f8
 
75f5073
7b8b9f8
75f5073
 
 
 
7b8b9f8
75f5073
 
7b8b9f8
75f5073
7b8b9f8
 
 
 
 
53fbd70
 
7b8b9f8
75f5073
7b8b9f8
d544279
7b8b9f8
d544279
75f5073
7b8b9f8
 
 
 
75f5073
7b8b9f8
 
d544279
7b8b9f8
 
 
 
d544279
7b8b9f8
 
 
 
 
d544279
7b8b9f8
 
 
 
 
53fbd70
d544279
75f5073
 
 
 
 
 
 
 
 
 
 
 
 
d544279
7b8b9f8
75f5073
 
 
d544279
75f5073
53fbd70
7b8b9f8
75f5073
7b8b9f8
 
 
75f5073
7b8b9f8
 
 
 
75f5073
7b8b9f8
 
75f5073
7b8b9f8
 
 
 
75f5073
7b8b9f8
 
 
 
 
 
 
 
 
75f5073
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b8b9f8
d544279
75f5073
d544279
75f5073
7b8b9f8
75f5073
d544279
 
 
7b8b9f8
75f5073
d544279
7b8b9f8
75f5073
d544279
7b8b9f8
75f5073
 
d544279
 
 
75f5073
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d544279
75f5073
d544279
 
 
 
 
75f5073
d544279
75f5073
d544279
 
 
 
75f5073
d544279
75f5073
d544279
 
75f5073
d544279
75f5073
 
 
 
 
 
 
 
 
 
d544279
 
75f5073
d544279
 
75f5073
 
d544279
75f5073
d544279
75f5073
 
 
 
d544279
75f5073
 
d544279
75f5073
7b8b9f8
75f5073
d544279
 
 
75f5073
d544279
75f5073
 
d544279
75f5073
7b8b9f8
 
d544279
7b8b9f8
75f5073
7b8b9f8
9743f5c
75f5073
 
9743f5c
d544279
75f5073
7b8b9f8
75f5073
 
d544279
75f5073
 
 
 
7b8b9f8
75f5073
7b8b9f8
75f5073
1fe9127
75f5073
 
d544279
75f5073
d544279
75f5073
d544279
 
 
53fbd70
d544279
75f5073
 
7b8b9f8
d544279
7b8b9f8
75f5073
 
7b8b9f8
75f5073
7b8b9f8
75f5073
 
 
d544279
75f5073
 
 
7b8b9f8
75f5073
 
 
 
 
53fbd70
7b8b9f8
53fbd70
75f5073
7b8b9f8
 
 
 
d544279
75f5073
 
7b8b9f8
 
 
 
 
75f5073
7b8b9f8
d544279
75f5073
d544279
75f5073
7b8b9f8
 
d544279
75f5073
 
d544279
7b8b9f8
d544279
 
75f5073
 
7b8b9f8
 
75f5073
7b8b9f8
75f5073
 
 
7b8b9f8
d544279
7b8b9f8
 
75f5073
7b8b9f8
 
75f5073
d544279
 
75f5073
 
d544279
75f5073
 
 
 
 
d544279
 
75f5073
d544279
 
7b8b9f8
 
75f5073
7b8b9f8
75f5073
 
 
 
 
7b8b9f8
75f5073
7b8b9f8
 
 
 
d544279
75f5073
d544279
 
7b8b9f8
 
d544279
75f5073
d544279
 
7b8b9f8
 
d544279
 
 
53fbd70
d544279
75f5073
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
import gradio as gr
import requests
import json
import re
import xml.etree.ElementTree as ET
import numpy as np
import random
import hashlib
from datetime import datetime
from collections import defaultdict, Counter
import time

class QuestionAnsweringAI:
    """Toy question-answering system backed by scraped RSS text.

    Pipeline: collect text (RSS feeds + canned Q&A templates) -> build a
    vocabulary and a topic->sentences knowledge base -> "train" (the neural
    network here is initialized but never actually updated; train_system is a
    progress-printing placeholder) -> answer questions by classifying the
    question type and splicing matching knowledge-base sentences into
    templated responses.
    """

    def __init__(self):
        # Vocabulary maps: id -> token and token -> id.
        self.vocabulary = {}
        self.token_to_id = {}
        self.vocab_size = 0

        # Neural network hyperparameters.
        # NOTE(review): the weights below are allocated but no forward or
        # backward pass is ever run — they are decorative.
        self.embedding_dim = 256
        self.hidden_dim = 512
        self.context_length = 32

        # Knowledge systems.
        self.knowledge_base = defaultdict(list)   # topic word -> [sentences]
        self.qa_patterns = defaultdict(list)
        self.context_memory = []                  # last few questions asked

        # Network weights (allocated in initialize_network).
        self.embeddings = None
        self.hidden_weights = None
        self.output_weights = None

        # N-gram pattern storage (only bigrams are ever populated).
        self.bigram_counts = defaultdict(Counter)
        self.trigram_counts = defaultdict(Counter)
        self.sentence_starts = []

        # Public RSS feeds used as the text corpus.
        self.data_sources = {
            "news_rss": [
                "https://feeds.reuters.com/reuters/worldNews",
                "https://feeds.bbci.co.uk/news/world/rss.xml",
                "https://feeds.bbci.co.uk/news/technology/rss.xml"
            ]
        }

        # Training state.
        self.total_tokens_collected = 0
        self.epochs_trained = 0
        self.learning_rate = 0.001
        self.max_response_length = 50

        self.initialize_network()

    def initialize_network(self):
        """Allocate randomly-initialized network weights.

        The 10000 rows/columns are a hard cap on the vocabulary the network
        could address; build_vocabulary may produce a different size, but the
        mismatch is harmless because the network is never evaluated.
        """
        self.embeddings = np.random.normal(0, 0.1, (10000, self.embedding_dim))
        self.hidden_weights = np.random.normal(0, 0.1, (self.embedding_dim * self.context_length, self.hidden_dim))
        self.hidden_bias = np.zeros(self.hidden_dim)
        self.output_weights = np.random.normal(0, 0.1, (self.hidden_dim, 10000))
        self.output_bias = np.zeros(10000)
        print("🧠 Neural Network initialized")

    def collect_training_data(self, max_tokens=20000):
        """Collect up to *max_tokens* tokens of training text.

        Scrapes the configured news feeds, appends the canned Q&A templates,
        then builds the vocabulary, knowledge base, and bigram patterns.
        Returns the flat token list.
        """
        print("πŸ•·οΈ Collecting Q&A training data...")

        collected_texts = []

        # Collect news data (best-effort; failures are skipped per feed).
        news_texts = self.scrape_news_feeds()
        collected_texts.extend(news_texts)
        print(f"πŸ“° Collected {len(news_texts)} news articles")

        # Create structured Q&A patterns.
        qa_patterns = self.create_qa_patterns()
        collected_texts.extend(qa_patterns)
        print(f"❓ Generated {len(qa_patterns)} Q&A patterns")

        # Drop trivially short fragments.
        quality_texts = [text for text in collected_texts if len(text) > 30]

        # Tokenize until the token budget is reached.
        all_tokens = []
        for text in quality_texts:
            tokens = self.tokenize_text(text)
            all_tokens.extend(tokens)
            if len(all_tokens) >= max_tokens:
                break

        self.total_tokens_collected = len(all_tokens)
        print(f"🎯 Collected {self.total_tokens_collected:,} tokens")

        # Build derived structures.
        self.build_vocabulary(all_tokens)
        self.build_knowledge_base(quality_texts)
        self.extract_patterns(all_tokens)

        return all_tokens

    def scrape_news_feeds(self):
        """Fetch the configured RSS feeds; return a list of cleaned texts.

        Each feed contributes at most 3 items.  Network or XML-parse failures
        on a feed are skipped so one dead feed does not abort collection.
        """
        texts = []

        for rss_url in self.data_sources["news_rss"]:
            try:
                response = requests.get(rss_url, timeout=5)
                if response.status_code != 200:
                    continue
                root = ET.fromstring(response.content)
                for item in root.findall(".//item")[:3]:
                    title = item.find("title")
                    description = item.find("description")
                    # FIX: an empty <title>/<description> element has
                    # .text == None; concatenating that raised TypeError.
                    if title is None or not title.text:
                        continue
                    text = title.text
                    if description is not None and description.text:
                        text += ". " + description.text
                    texts.append(self.clean_text(text))
            except (requests.RequestException, ET.ParseError):
                # FIX: was a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit.  Keep the best-effort skip,
                # but only for expected network/parse failures.
                continue

        return texts

    def create_qa_patterns(self):
        """Return canned 'Question: ... Answer: ...' training strings."""
        patterns = []

        # Question-answer templates (seed corpus so the system can answer
        # something even when every feed is unreachable).
        qa_templates = [
            ("What is artificial intelligence?", "Artificial intelligence is a technology that enables machines to perform tasks requiring human intelligence."),
            ("How do computers work?", "Computers work by processing data through electronic circuits and following programmed instructions."),
            ("Where is Paris located?", "Paris is located in France and serves as the capital city."),
            ("Why is education important?", "Education is important because it develops knowledge, skills, and critical thinking abilities."),
            ("What is machine learning?", "Machine learning is a subset of AI that allows systems to learn from data without explicit programming."),
            ("How does the internet work?", "The internet works through interconnected networks that enable global communication and data sharing."),
            ("What is climate change?", "Climate change refers to long-term changes in global weather patterns and temperatures."),
            ("Why do we need renewable energy?", "Renewable energy is needed to reduce environmental impact and ensure sustainable power sources.")
        ]

        for question, answer in qa_templates:
            pattern = f"Question: {question} Answer: {answer}"
            patterns.append(pattern)

        return patterns

    def clean_text(self, text):
        """Strip HTML tags, collapse whitespace, and drop odd characters."""
        if not text:
            return ""

        # Remove HTML tags, collapse runs of whitespace, then keep only
        # word characters and common punctuation.
        text = re.sub(r'<[^>]+>', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\"\']+', ' ', text)

        return text.strip()

    def tokenize_text(self, text):
        """Lowercase *text* and split into word and punctuation tokens."""
        tokens = re.findall(r'\w+|[.!?;,]', text.lower())
        return tokens

    def build_vocabulary(self, tokens):
        """Build id<->token maps from tokens seen at least twice.

        Ids 0-3 are reserved for <PAD>/<UNK>/<START>/<END>; <UNK> (id 1) is
        the fallback used by extract_patterns/train_system.
        """
        token_counts = Counter(tokens)
        filtered_tokens = {token: count for token, count in token_counts.items() if count >= 2}

        vocab_list = ['<PAD>', '<UNK>', '<START>', '<END>'] + list(filtered_tokens.keys())

        self.vocabulary = {i: token for i, token in enumerate(vocab_list)}
        self.token_to_id = {token: i for i, token in enumerate(vocab_list)}
        self.vocab_size = len(vocab_list)

        print(f"πŸ“š Built vocabulary: {self.vocab_size:,} tokens")

    def build_knowledge_base(self, texts):
        """Index sentences by their first capitalized word (>3 chars).

        A crude topic heuristic: the first capitalized, longer-than-3-letter
        word of each sentence becomes its lowercase topic key.
        """
        for text in texts:
            sentences = re.split(r'[.!?]+', text)
            for sentence in sentences:
                sentence = sentence.strip()
                if len(sentence) > 20:
                    # str.split() never yields empty strings, so word[0] is safe.
                    words = sentence.split()
                    for word in words:
                        if word[0].isupper() and len(word) > 3:
                            topic = word.lower()
                            self.knowledge_base[topic].append(sentence)
                            break

    def extract_patterns(self, tokens):
        """Count bigram transitions over the token-id stream (1 = <UNK>)."""
        token_ids = [self.token_to_id.get(token, 1) for token in tokens]

        # Build bigrams: count of next-id given current-id.
        for i in range(len(token_ids) - 1):
            current_token = token_ids[i]
            next_token = token_ids[i + 1]
            self.bigram_counts[current_token][next_token] += 1

        print(f"πŸ“Š Extracted {len(self.bigram_counts):,} bigram patterns")

    def train_system(self, training_tokens, epochs=3):
        """Simulated training loop.

        NOTE(review): this only prints progress and bumps epochs_trained —
        no weights are updated.  Kept as-is to preserve behavior.
        """
        print(f"πŸŽ“ Training system for {epochs} epochs...")

        token_ids = [self.token_to_id.get(token, 1) for token in training_tokens]

        for epoch in range(epochs):
            print(f"Training epoch {epoch + 1}/{epochs}")

            # Simple training simulation.
            total_batches = min(100, len(token_ids) // 10)

            for batch in range(total_batches):
                if batch % 25 == 0:
                    print(f"  Batch {batch + 1}/{total_batches}")

            self.epochs_trained += 1

        print("βœ… Training completed!")

    def answer_question(self, question):
        """Answer *question* using the knowledge base and response templates."""
        if not question.strip():
            return "Hello! I'm an AI that learns from data. Ask me a question!"

        # Remember the last 5 questions for the stats display.
        self.context_memory.append(question)
        if len(self.context_memory) > 5:
            self.context_memory.pop(0)

        # Classify question type.
        question_type = self.classify_question(question)

        # Find relevant knowledge.
        relevant_knowledge = self.find_relevant_knowledge(question)

        # Generate response.
        response = self.generate_response(question, question_type, relevant_knowledge)

        return response

    def classify_question(self, question):
        """Classify a question by keyword into one of five coarse types."""
        question_lower = question.lower()

        # Substring (not word) matching — 'what' also fires on 'whatever'.
        if any(word in question_lower for word in ['what', 'define', 'explain']):
            return 'definition'
        elif any(word in question_lower for word in ['where', 'location']):
            return 'location'
        elif any(word in question_lower for word in ['how', 'method']):
            return 'process'
        elif any(word in question_lower for word in ['why', 'reason']):
            return 'explanation'
        else:
            return 'general'

    def find_relevant_knowledge(self, question):
        """Return up to three distinct knowledge sentences relevant to *question*.

        Relevance: (a) the topic key occurs in the question, then (b) the
        fact shares at least two words with the question.
        """
        question_words = set(question.lower().split())
        relevant_facts = []
        seen = set()  # FIX: drop duplicates so repeats can't crowd out distinct facts

        def _add(fact):
            # Append *fact* once, preserving first-seen order.
            if fact not in seen:
                seen.add(fact)
                relevant_facts.append(fact)

        # Direct topic match: up to 2 facts per matching topic.
        for topic, facts in self.knowledge_base.items():
            if topic in question.lower():
                for fact in facts[:2]:
                    _add(fact)

        # Word-overlap match.  FIX: the original `break` only exited the
        # inner loop, so the list kept growing past the intended cap.
        for topic, facts in self.knowledge_base.items():
            if len(relevant_facts) >= 3:
                break
            for fact in facts:
                fact_words = set(fact.lower().split())
                if len(question_words.intersection(fact_words)) >= 2:
                    _add(fact)
                    if len(relevant_facts) >= 3:
                        break

        return relevant_facts[:3]

    def generate_response(self, question, question_type, knowledge):
        """Build the reply: a per-type starter plus knowledge or a fallback."""

        # Response templates.
        templates = {
            'definition': "Based on my training data, this refers to",
            'location': "From geographical information I've learned,",
            'process': "According to technical sources,",
            'explanation': "The reason is that",
            'general': "From my knowledge base,"
        }

        starter = templates.get(question_type, "Based on what I've learned,")

        if knowledge:
            # Use relevant knowledge, truncated to keep replies short.
            response = f"{starter} {knowledge[0][:150]}..."
            if len(knowledge) > 1:
                response += f" Additionally, {knowledge[1][:100]}..."
        else:
            # Fallback responses when nothing relevant was found.
            fallbacks = {
                'definition': f"{starter} a concept that involves multiple factors and considerations.",
                'location': f"{starter} this refers to a specific place or region.",
                'process': f"{starter} this involves a series of steps and procedures.",
                'explanation': f"{starter} multiple factors contribute to this.",
                'general': f"{starter} this is a topic with various aspects to consider."
            }
            response = fallbacks.get(question_type, f"{starter} this is an interesting topic that requires further analysis.")

        # Ensure proper ending.
        if not response.endswith('.'):
            response += '.'

        return response[:300]  # Limit response length

    def get_stats(self):
        """Return a dict of counters for the status display."""
        return {
            "tokens_collected": self.total_tokens_collected,
            "vocabulary_size": self.vocab_size,
            "epochs_trained": self.epochs_trained,
            "knowledge_topics": len(self.knowledge_base),
            "bigram_patterns": len(self.bigram_counts),
            "memory_items": len(self.context_memory)
        }

# Initialize system: a single shared QuestionAnsweringAI instance used by
# every Gradio callback below (training mutates it in place).
qa_system = QuestionAnsweringAI()

def train_qa_system():
    """Collect a corpus and train the shared Q&A system.

    Returns a one-line status string for the Gradio status box; any
    exception is reported rather than propagated to the UI.
    """
    try:
        corpus_tokens = qa_system.collect_training_data(max_tokens=15000)
        if len(corpus_tokens) <= 50:
            # Not enough material to be worth a training pass.
            return "❌ Insufficient data collected for training"
        qa_system.train_system(corpus_tokens, epochs=2)
        return "βœ… Q&A System training completed successfully!"
    except Exception as e:
        return f"❌ Training error: {str(e)}"

def chat_with_ai(message, history):
    """Gradio chat handler: append (message, reply) and clear the input.

    Blank/whitespace messages get a canned greeting instead of hitting
    the Q&A engine.  Returns (updated history, "") for the two outputs.
    """
    if message.strip():
        reply = qa_system.answer_question(message)
    else:
        reply = "Hi! I'm an AI that learns from data and answers questions. What would you like to know?"

    history.append([message, reply])
    return history, ""

def get_system_status():
    """Render the shared system's statistics as a markdown status report."""
    stats = qa_system.get_stats()

    # Assemble the report as a list of fragments and join once at the end.
    parts = ["πŸ€– **QUESTION ANSWERING AI STATUS**\n\n"]

    if stats['tokens_collected'] == 0:
        parts.append("⏳ **System not trained yet**\nClick 'Start Training' to begin\n\n")
    else:
        parts.append("βœ… **System trained and operational**\n\n")

    parts.append("**πŸ“Š Statistics:**\n")
    parts.append(f"β€’ **Tokens collected:** {stats['tokens_collected']:,}\n")
    parts.append(f"β€’ **Vocabulary size:** {stats['vocabulary_size']:,}\n")
    parts.append(f"β€’ **Knowledge topics:** {stats['knowledge_topics']:,}\n")
    parts.append(f"β€’ **Training epochs:** {stats['epochs_trained']}\n")
    parts.append(f"β€’ **Pattern database:** {stats['bigram_patterns']:,} patterns\n")
    parts.append(f"β€’ **Conversation memory:** {stats['memory_items']} messages\n")

    parts.append("\n**🎯 Capabilities:**\n")
    parts.append("β€’ Answers questions using learned knowledge\n")
    parts.append("β€’ Processes natural language queries\n")
    parts.append("β€’ Maintains conversation context\n")
    parts.append("β€’ Uses pattern matching for responses\n")

    return "".join(parts)

# Create Gradio interface.  Component creation order matters: widgets must
# exist before the event handlers at the bottom reference them.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    
    # Page header banner.
    gr.HTML("""
    <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
        <h1>πŸ€– Question Answering AI</h1>
        <p><b>AI that learns from data and answers questions</b></p>
        <p>Collects tokens from internet β†’ Organizes neural patterns β†’ Generates intelligent responses</p>
    </div>
    """)
    
    with gr.Row():
        # Left column (wider): the chat panel.
        with gr.Column(scale=2):
            gr.HTML("<h3>πŸ’¬ Chat with AI</h3>")
            
            chatbot = gr.Chatbot(
                label="Question Answering AI Chat",
                height=400,
                show_label=True
            )
            
            msg_input = gr.Textbox(
                label="Your question",
                placeholder="Ask me anything: What is AI? How does technology work?",
                lines=2
            )
            
            with gr.Row():
                send_btn = gr.Button("πŸ’¬ Send", variant="primary")
                clear_btn = gr.Button("πŸ”„ Clear", variant="secondary")
        
        # Right column: status display plus train/refresh controls.
        with gr.Column(scale=1):
            gr.HTML("<h3>βš™οΈ System Status</h3>")
            
            status_output = gr.Textbox(
                label="System Status",
                lines=18,
                interactive=False,
                value=get_system_status()
            )
            
            train_btn = gr.Button("πŸš€ Start Training", variant="secondary")
            refresh_btn = gr.Button("πŸ”„ Refresh Status", variant="secondary")
    
    # Example questions (clicking one fills the input box).
    gr.Examples(
        examples=[
            "What is artificial intelligence?",
            "How do computers work?",
            "Where is Paris located?",
            "Why is education important?",
            "Explain machine learning",
            "How does the internet work?",
            "What is climate change?",
            "Why do we need renewable energy?"
        ],
        inputs=msg_input,
        label="🎯 Example Questions"
    )
    
    # Explanatory footer.
    gr.HTML("""
    <div style="margin-top: 20px; padding: 15px; background-color: #f0f0f0; border-radius: 8px;">
        <h4>🧠 How It Works:</h4>
        <ol>
            <li><b>Data Collection:</b> Gathers text from news feeds and creates Q&A patterns</li>
            <li><b>Knowledge Building:</b> Extracts facts and builds searchable knowledge base</li>
            <li><b>Pattern Learning:</b> Learns language patterns from collected data</li>
            <li><b>Question Processing:</b> Classifies questions and finds relevant knowledge</li>
            <li><b>Response Generation:</b> Creates intelligent answers using learned patterns</li>
        </ol>
        <p><b>🎯 Result:</b> An AI that can answer questions using knowledge learned from data!</p>
    </div>
    """)
    
    # Event handlers.  Send button and Enter-in-textbox share one handler.
    send_btn.click(
        chat_with_ai,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input]
    )
    
    msg_input.submit(
        chat_with_ai,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input]
    )
    
    # Clear resets both the chat history and the input box.
    clear_btn.click(
        lambda: ([], ""),
        outputs=[chatbot, msg_input]
    )
    
    # Training and refresh both rewrite the status textbox.
    train_btn.click(
        train_qa_system,
        outputs=[status_output]
    )
    
    refresh_btn.click(
        get_system_status,
        outputs=[status_output]
    )

if __name__ == "__main__":
    demo.launch()