|
|
import gradio as gr |
|
|
import requests |
|
|
import json |
|
|
import re |
|
|
import xml.etree.ElementTree as ET |
|
|
import numpy as np |
|
|
import random |
|
|
import hashlib |
|
|
from datetime import datetime |
|
|
from collections import defaultdict, Counter |
|
|
import time |
|
|
|
|
|
class QuestionAnsweringAI:
    """Lightweight question-answering system.

    Pipeline: collect public text (RSS news feeds plus built-in Q&A
    templates), build a vocabulary, a keyword knowledge base and bigram
    statistics, then answer questions by classifying them and filling
    response templates with the most relevant learned facts.

    NOTE(review): the numpy "network" weights are allocated but never
    updated or used for inference — training is simulated; answers come
    from the knowledge base / templates only.
    """

    def __init__(self):
        # --- Vocabulary mappings ---
        self.vocabulary = {}       # id -> token
        self.token_to_id = {}      # token -> id
        self.vocab_size = 0

        # --- Network hyper-parameters ---
        self.embedding_dim = 256
        self.hidden_dim = 512
        self.context_length = 32

        # --- Knowledge storage ---
        self.knowledge_base = defaultdict(list)  # topic word -> list of sentences
        self.qa_patterns = defaultdict(list)     # currently unpopulated
        self.context_memory = []                 # last 5 user questions

        # --- Network parameters (allocated in initialize_network) ---
        self.embeddings = None
        self.hidden_weights = None
        self.output_weights = None

        # --- N-gram statistics (trigram_counts / sentence_starts unused) ---
        self.bigram_counts = defaultdict(Counter)
        self.trigram_counts = defaultdict(Counter)
        self.sentence_starts = []

        # --- Public data sources for scraping ---
        self.data_sources = {
            "news_rss": [
                "https://feeds.reuters.com/reuters/worldNews",
                "https://feeds.bbci.co.uk/news/world/rss.xml",
                "https://feeds.bbci.co.uk/news/technology/rss.xml"
            ]
        }

        # --- Training bookkeeping ---
        self.total_tokens_collected = 0
        self.epochs_trained = 0
        self.learning_rate = 0.001
        self.max_response_length = 50

        self.initialize_network()

    def initialize_network(self):
        """Allocate randomly initialized network weights.

        The vocabulary dimension is fixed at 10,000 slots because the real
        vocabulary size is not known until data collection runs.
        """
        self.embeddings = np.random.normal(0, 0.1, (10000, self.embedding_dim))
        self.hidden_weights = np.random.normal(
            0, 0.1, (self.embedding_dim * self.context_length, self.hidden_dim)
        )
        self.hidden_bias = np.zeros(self.hidden_dim)
        self.output_weights = np.random.normal(0, 0.1, (self.hidden_dim, 10000))
        self.output_bias = np.zeros(10000)
        print("π§ Neural Network initialized")

    def collect_training_data(self, max_tokens=20000):
        """Collect and tokenize training text, then build derived structures.

        Args:
            max_tokens: soft cap on how many tokens are gathered.

        Returns:
            list[str]: the collected token stream.
        """
        print("π·οΈ Collecting Q&A training data...")

        collected_texts = []

        # Live news headlines (best-effort; may be empty when offline).
        news_texts = self.scrape_news_feeds()
        collected_texts.extend(news_texts)
        print(f"π° Collected {len(news_texts)} news articles")

        # Deterministic built-in Q&A templates.
        qa_patterns = self.create_qa_patterns()
        collected_texts.extend(qa_patterns)
        print(f"β Generated {len(qa_patterns)} Q&A patterns")

        # Drop very short fragments before tokenization.
        quality_texts = [text for text in collected_texts if len(text) > 30]

        # Tokenize until the cap is reached.
        all_tokens = []
        for text in quality_texts:
            all_tokens.extend(self.tokenize_text(text))
            if len(all_tokens) >= max_tokens:
                break

        self.total_tokens_collected = len(all_tokens)
        print(f"π― Collected {self.total_tokens_collected:,} tokens")

        # Derived structures used at answer time.
        self.build_vocabulary(all_tokens)
        self.build_knowledge_base(quality_texts)
        self.extract_patterns(all_tokens)

        return all_tokens

    def scrape_news_feeds(self):
        """Fetch headlines (and descriptions) from the configured RSS feeds.

        Failures on any single feed are skipped so one dead feed cannot
        abort data collection.

        Returns:
            list[str]: cleaned "title. description" strings.
        """
        texts = []

        for rss_url in self.data_sources["news_rss"]:
            try:
                response = requests.get(rss_url, timeout=5)
                if response.status_code != 200:
                    continue
                root = ET.fromstring(response.content)
                # Only the first few items per feed, to keep collection fast.
                for item in root.findall(".//item")[:3]:
                    title = item.find("title")
                    description = item.find("description")
                    if title is None:
                        continue
                    # FIX: <title>/<description> elements can exist with no
                    # text (.text is None); the original crashed on "+ None".
                    text = title.text or ""
                    if description is not None and description.text:
                        text += ". " + description.text
                    if text:
                        texts.append(self.clean_text(text))
            except Exception:
                # FIX: was a bare `except:` (also caught KeyboardInterrupt /
                # SystemExit). Best-effort scraping: skip bad feeds.
                continue

        return texts

    def create_qa_patterns(self):
        """Create structured Q&A patterns from a fixed template set.

        Returns:
            list[str]: "Question: ... Answer: ..." training strings.
        """
        patterns = []

        qa_templates = [
            ("What is artificial intelligence?", "Artificial intelligence is a technology that enables machines to perform tasks requiring human intelligence."),
            ("How do computers work?", "Computers work by processing data through electronic circuits and following programmed instructions."),
            ("Where is Paris located?", "Paris is located in France and serves as the capital city."),
            ("Why is education important?", "Education is important because it develops knowledge, skills, and critical thinking abilities."),
            ("What is machine learning?", "Machine learning is a subset of AI that allows systems to learn from data without explicit programming."),
            ("How does the internet work?", "The internet works through interconnected networks that enable global communication and data sharing."),
            ("What is climate change?", "Climate change refers to long-term changes in global weather patterns and temperatures."),
            ("Why do we need renewable energy?", "Renewable energy is needed to reduce environmental impact and ensure sustainable power sources.")
        ]

        for question, answer in qa_templates:
            patterns.append(f"Question: {question} Answer: {answer}")

        return patterns

    def clean_text(self, text):
        """Strip HTML tags, collapse whitespace and drop exotic characters."""
        if not text:
            return ""

        text = re.sub(r'<[^>]+>', ' ', text)   # remove HTML/XML tags
        text = re.sub(r'\s+', ' ', text)       # collapse runs of whitespace
        # Keep word chars, whitespace and common punctuation only.
        text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\"\']+', ' ', text)

        return text.strip()

    def tokenize_text(self, text):
        """Lowercase and split text into word and punctuation tokens."""
        return re.findall(r'\w+|[.!?;,]', text.lower())

    def build_vocabulary(self, tokens):
        """Build id<->token maps from tokens seen at least twice."""
        token_counts = Counter(tokens)
        # Frequency filter: singletons are noise.
        filtered_tokens = {token: count for token, count in token_counts.items() if count >= 2}

        # Special tokens occupy the first four ids (so <UNK> is always id 1).
        vocab_list = ['<PAD>', '<UNK>', '<START>', '<END>'] + list(filtered_tokens.keys())

        self.vocabulary = {i: token for i, token in enumerate(vocab_list)}
        self.token_to_id = {token: i for i, token in enumerate(vocab_list)}
        self.vocab_size = len(vocab_list)

        print(f"π Built vocabulary: {self.vocab_size:,} tokens")

    def build_knowledge_base(self, texts):
        """Index sentences under their first capitalized word (>3 chars).

        The capitalized word is treated as the sentence's topic; each
        sentence is filed under at most one topic.
        """
        for text in texts:
            sentences = re.split(r'[.!?]+', text)
            for sentence in sentences:
                sentence = sentence.strip()
                if len(sentence) > 20:
                    words = sentence.split()
                    for word in words:
                        if word[0].isupper() and len(word) > 3:
                            topic = word.lower()
                            self.knowledge_base[topic].append(sentence)
                            break  # one topic per sentence

    def extract_patterns(self, tokens):
        """Accumulate bigram counts over the token stream."""
        # Unknown tokens map to id 1 (<UNK>).
        token_ids = [self.token_to_id.get(token, 1) for token in tokens]

        for i in range(len(token_ids) - 1):
            self.bigram_counts[token_ids[i]][token_ids[i + 1]] += 1

        print(f"π Extracted {len(self.bigram_counts):,} bigram patterns")

    def train_system(self, training_tokens, epochs=3):
        """Run the (simulated) training loop.

        The batch loop performs no weight updates; it only prints progress
        and advances the epoch counter.
        """
        print(f"π Training system for {epochs} epochs...")

        token_ids = [self.token_to_id.get(token, 1) for token in training_tokens]

        for epoch in range(epochs):
            print(f"Training epoch {epoch + 1}/{epochs}")

            total_batches = min(100, len(token_ids) // 10)

            for batch in range(total_batches):
                if batch % 25 == 0:
                    print(f" Batch {batch + 1}/{total_batches}")

            self.epochs_trained += 1

        # FIX: the original message was an unterminated string literal split
        # across two source lines (a syntax error); joined onto one line.
        print("β Training completed!")

    def answer_question(self, question):
        """Answer a question using the learned knowledge base.

        Args:
            question: the user's natural-language question.

        Returns:
            str: a generated answer, capped at 300 characters.
        """
        if not question.strip():
            return "Hello! I'm an AI that learns from data. Ask me a question!"

        # Rolling memory of the last 5 questions.
        self.context_memory.append(question)
        if len(self.context_memory) > 5:
            self.context_memory.pop(0)

        question_type = self.classify_question(question)
        relevant_knowledge = self.find_relevant_knowledge(question)
        return self.generate_response(question, question_type, relevant_knowledge)

    def classify_question(self, question):
        """Classify the question by keyword (substring) matching.

        Returns one of: 'definition', 'location', 'process',
        'explanation', 'general'.
        """
        question_lower = question.lower()

        if any(word in question_lower for word in ['what', 'define', 'explain']):
            return 'definition'
        elif any(word in question_lower for word in ['where', 'location']):
            return 'location'
        elif any(word in question_lower for word in ['how', 'method']):
            return 'process'
        elif any(word in question_lower for word in ['why', 'reason']):
            return 'explanation'
        else:
            return 'general'

    def find_relevant_knowledge(self, question):
        """Return up to three stored facts relevant to the question.

        Two passes: (1) facts filed under a topic word appearing verbatim
        in the question; (2) facts sharing at least two words with it.
        """
        question_words = set(question.lower().split())
        relevant_facts = []

        # Pass 1: direct topic hits.
        for topic, facts in self.knowledge_base.items():
            if topic in question.lower():
                relevant_facts.extend(facts[:2])

        # Pass 2: word-overlap hits.
        for topic, facts in self.knowledge_base.items():
            for fact in facts:
                fact_words = set(fact.lower().split())
                overlap = len(question_words.intersection(fact_words))
                if overlap >= 2:
                    relevant_facts.append(fact)
                    if len(relevant_facts) >= 3:
                        break

        return relevant_facts[:3]

    def generate_response(self, question, question_type, knowledge):
        """Compose an answer from a type-specific starter plus facts.

        Falls back to canned phrasing when no knowledge was found.
        """
        templates = {
            'definition': "Based on my training data, this refers to",
            'location': "From geographical information I've learned,",
            'process': "According to technical sources,",
            'explanation': "The reason is that",
            'general': "From my knowledge base,"
        }

        starter = templates.get(question_type, "Based on what I've learned,")

        if knowledge:
            # Truncate facts so the reply stays compact.
            response = f"{starter} {knowledge[0][:150]}..."
            if len(knowledge) > 1:
                response += f" Additionally, {knowledge[1][:100]}..."
        else:
            fallbacks = {
                'definition': f"{starter} a concept that involves multiple factors and considerations.",
                'location': f"{starter} this refers to a specific place or region.",
                'process': f"{starter} this involves a series of steps and procedures.",
                'explanation': f"{starter} multiple factors contribute to this.",
                'general': f"{starter} this is a topic with various aspects to consider."
            }
            response = fallbacks.get(question_type, f"{starter} this is an interesting topic that requires further analysis.")

        if not response.endswith('.'):
            response += '.'

        return response[:300]

    def get_stats(self):
        """Return a snapshot of system counters for the status panel."""
        return {
            "tokens_collected": self.total_tokens_collected,
            "vocabulary_size": self.vocab_size,
            "epochs_trained": self.epochs_trained,
            "knowledge_topics": len(self.knowledge_base),
            "bigram_patterns": len(self.bigram_counts),
            "memory_items": len(self.context_memory)
        }
|
|
|
|
|
|
|
|
qa_system = QuestionAnsweringAI() |
|
|
|
|
|
def train_qa_system():
    """Collect data and train the global `qa_system`.

    Returns:
        str: a human-readable status message; never raises (errors are
        reported in the return value so the UI stays responsive).
    """
    try:
        # Collect up to 15k tokens from news feeds + built-in templates.
        tokens = qa_system.collect_training_data(max_tokens=15000)

        if len(tokens) > 50:
            qa_system.train_system(tokens, epochs=2)
            # FIX: the original message was an unterminated string literal
            # split across two source lines (syntax error); joined here.
            return "β Q&A System training completed successfully!"
        else:
            return "β Insufficient data collected for training"
    except Exception as e:
        return f"β Training error: {str(e)}"
|
|
|
|
|
def chat_with_ai(message, history):
    """Chat interface function"""
    stripped = message.strip()
    if stripped:
        # Delegate real questions to the global Q&A engine.
        reply = qa_system.answer_question(message)
    else:
        # Greeting for empty/whitespace-only input.
        reply = "Hi! I'm an AI that learns from data and answers questions. What would you like to know?"

    # Append the turn and clear the input textbox.
    history.append([message, reply])
    return history, ""
|
|
|
|
|
def get_system_status():
    """Build the markdown status report shown in the UI status panel.

    Returns:
        str: multi-line markdown summarizing training state and counters.
    """
    stats = qa_system.get_stats()

    status = "π€ **QUESTION ANSWERING AI STATUS**\n\n"

    # tokens_collected == 0 means collect_training_data has never run.
    if stats['tokens_collected'] == 0:
        status += "β³ **System not trained yet**\nClick 'Start Training' to begin\n\n"
    else:
        # FIX: the original line was an unterminated string literal split
        # across two source lines (syntax error); joined onto one line.
        status += "β **System trained and operational**\n\n"

    status += "**π Statistics:**\n"
    status += f"β’ **Tokens collected:** {stats['tokens_collected']:,}\n"
    status += f"β’ **Vocabulary size:** {stats['vocabulary_size']:,}\n"
    status += f"β’ **Knowledge topics:** {stats['knowledge_topics']:,}\n"
    status += f"β’ **Training epochs:** {stats['epochs_trained']}\n"
    status += f"β’ **Pattern database:** {stats['bigram_patterns']:,} patterns\n"
    status += f"β’ **Conversation memory:** {stats['memory_items']} messages\n"

    status += "\n**π― Capabilities:**\n"
    status += "β’ Answers questions using learned knowledge\n"
    status += "β’ Processes natural language queries\n"
    status += "β’ Maintains conversation context\n"
    status += "β’ Uses pattern matching for responses\n"

    return status
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI. Components render in creation order inside the Blocks context,
# so the statement order below defines the page layout; code left unchanged.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:

    # Page header banner.
    gr.HTML("""
    <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 10px; margin-bottom: 20px;">
        <h1>π€ Question Answering AI</h1>
        <p><b>AI that learns from data and answers questions</b></p>
        <p>Collects tokens from internet β Organizes neural patterns β Generates intelligent responses</p>
    </div>
    """)

    with gr.Row():
        # Left column (wider): chat history + input + send/clear buttons.
        with gr.Column(scale=2):
            gr.HTML("<h3>π¬ Chat with AI</h3>")

            chatbot = gr.Chatbot(
                label="Question Answering AI Chat",
                height=400,
                show_label=True
            )

            msg_input = gr.Textbox(
                label="Your question",
                placeholder="Ask me anything: What is AI? How does technology work?",
                lines=2
            )

            with gr.Row():
                send_btn = gr.Button("π¬ Send", variant="primary")
                clear_btn = gr.Button("π Clear", variant="secondary")

        # Right column (narrower): status readout + training controls.
        with gr.Column(scale=1):
            gr.HTML("<h3>βοΈ System Status</h3>")

            status_output = gr.Textbox(
                label="System Status",
                lines=18,
                interactive=False,
                # Initial snapshot taken once at app construction time.
                value=get_system_status()
            )

            train_btn = gr.Button("π Start Training", variant="secondary")
            refresh_btn = gr.Button("π Refresh Status", variant="secondary")

    # Clickable example questions that populate the input textbox.
    gr.Examples(
        examples=[
            "What is artificial intelligence?",
            "How do computers work?",
            "Where is Paris located?",
            "Why is education important?",
            "Explain machine learning",
            "How does the internet work?",
            "What is climate change?",
            "Why do we need renewable energy?"
        ],
        inputs=msg_input,
        label="π― Example Questions"
    )

    # Footer explainer panel.
    gr.HTML("""
    <div style="margin-top: 20px; padding: 15px; background-color: #f0f0f0; border-radius: 8px;">
        <h4>π§ How It Works:</h4>
        <ol>
            <li><b>Data Collection:</b> Gathers text from news feeds and creates Q&A patterns</li>
            <li><b>Knowledge Building:</b> Extracts facts and builds searchable knowledge base</li>
            <li><b>Pattern Learning:</b> Learns language patterns from collected data</li>
            <li><b>Question Processing:</b> Classifies questions and finds relevant knowledge</li>
            <li><b>Response Generation:</b> Creates intelligent answers using learned patterns</li>
        </ol>
        <p><b>π― Result:</b> An AI that can answer questions using knowledge learned from data!</p>
    </div>
    """)

    # Event wiring: both the Send button and Enter in the textbox submit;
    # each returns the updated history plus "" to clear the input.
    send_btn.click(
        chat_with_ai,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input]
    )

    msg_input.submit(
        chat_with_ai,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input]
    )

    # Reset chat history and input box in one shot.
    clear_btn.click(
        lambda: ([], ""),
        outputs=[chatbot, msg_input]
    )

    # Long-running training; result string replaces the status panel text.
    train_btn.click(
        train_qa_system,
        outputs=[status_output]
    )

    refresh_btn.click(
        get_system_status,
        outputs=[status_output]
    )
|
|
|
|
|
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()