# NoahsKI / google_search_learner.py
# (uploaded by noah33565 — "Upload 221 files", commit 8d3de43 verified)
"""
GOOGLE SEARCH INTEGRATION & WEB LEARNING
Enables self-learning through web search, article parsing, and knowledge extraction
"""
import logging
from typing import Dict, List, Optional, Any
import json
import os
from urllib.parse import quote
import re
logger = logging.getLogger(__name__)
class GoogleSearchLearner:
    """
    Integrates Google search for self-learning.

    Can search, parse, and learn from web content.  Extracted knowledge is
    kept in memory and can be persisted to / restored from JSON under the
    ``noahski_data/`` directory.
    """

    def __init__(self):
        # query (lower-cased, stripped) -> cached search-result dict
        self.search_cache = {}
        # topic -> list of {'knowledge', 'timestamp', 'confidence'} records
        self.learned_knowledge = {}
        # chronological log of every query issued
        self.search_history = []
        self.load_knowledge_cache()

    def search_and_learn(self, query: str, num_results: int = 5) -> Dict:
        """
        Search Google and learn from results.

        (In production this would use the Google Custom Search API or
        similar; until that is configured, the call records the query in
        the history and returns a structured failure result.)
        """
        logger.info(f"Searching for: {query}")

        # Check cache first.
        cache_key = query.lower().strip()
        if cache_key in self.search_cache:
            logger.info(f"Using cached results for: {query}")
            return self.search_cache[cache_key]

        # In production, use the actual API:
        #   results = self._perform_google_search(query, num_results)
        # Note: failed lookups are deliberately NOT written to the cache.
        search_result = {
            'query': query,
            'success': False,
            'results': [],
            'error': 'Google Search API not configured'
        }

        self.search_history.append({
            'query': query,
            'timestamp': self._get_timestamp(),
            'result_count': len(search_result['results'])
        })
        return search_result

    def extract_knowledge_from_text(self, text: str, topic: str = '') -> Dict:
        """Extract structured knowledge from *text* and store it under *topic*."""
        knowledge = {
            'topic': topic,
            'key_points': [],
            'definitions': {},
            'relationships': [],
            'summary': '',
            'learning_confidence': 0.0
        }

        # Sentences that look like definitions ("X is Y", "X: Y", "X = Y").
        knowledge['definitions'] = self._extract_definitions(text)
        # Sentences flagged by importance keywords.
        knowledge['key_points'] = self._extract_key_points(text)
        # "X causes Y"-style concept links.
        knowledge['relationships'] = self._extract_relationships(text, topic)
        # Naive leading-sentences summary.
        knowledge['summary'] = self._generate_summary(text, num_sentences=3)
        # Confidence grows with the amount of extracted structure.
        knowledge['learning_confidence'] = self._calculate_confidence(knowledge)

        # Persist into the in-memory knowledge base.
        self._store_knowledge(topic, knowledge)
        return knowledge

    def _extract_definitions(self, text: str) -> Dict[str, str]:
        """Extract potential term -> definition pairs from *text*."""
        definitions = {}
        # Patterns: "X is/are [a|an] Y", "X: Y", "X = Y".
        # FIX: the optional article is grouped WITH its trailing whitespace
        # ("(?:(?:a|an)\s+)?"); the previous form "(?:a|an)?\s+" required two
        # whitespace runs, so article-less definitions like "Water is wet"
        # never matched.
        patterns = [
            r'(\w+)\s+(?:is|are)\s+(?:(?:a|an)\s+)?([^.!?]+)',
            r'(\w+):\s+([^.!?]+)',
            r'(\w+)\s+=\s+([^.!?]+)',
        ]
        for pattern in patterns:
            for key, value in re.findall(pattern, text):
                # Keep only short, plausible term/definition pairs.
                if len(key.split()) <= 3 and len(value) < 200:
                    definitions[key.strip()] = value.strip()
        return definitions

    def _extract_key_points(self, text: str) -> List[str]:
        """Extract sentences that contain importance-signalling keywords."""
        key_points = []
        # Split into sentences on terminal punctuation.
        sentences = re.split(r'[.!?]+', text)
        keywords = ['important', 'key', 'significant', 'note', 'must', 'should', 'critical']
        for sentence in sentences:
            if any(kw in sentence.lower() for kw in keywords):
                cleaned = sentence.strip()
                # Skip fragments too short to be meaningful.
                if cleaned and len(cleaned) > 10:
                    key_points.append(cleaned)
        return key_points[:10]  # Limit to 10 key points

    def _extract_relationships(self, text: str, main_topic: str) -> List[Dict]:
        """Extract simple causal/associative relationships between concepts."""
        relationships = []
        # Single-word source/target pairs joined by a relation phrase.
        patterns = [
            r'(\w+)\s+leads to\s+(\w+)',
            r'(\w+)\s+causes\s+(\w+)',
            r'(\w+)\s+related to\s+(\w+)',
            r'(\w+)\s+affects\s+(\w+)',
        ]
        for pattern in patterns:
            for source, target in re.findall(pattern, text, re.IGNORECASE):
                relationships.append({
                    'source': source,
                    'target': target,
                    'main_topic': main_topic,
                    'type': 'connection'
                })
        return relationships

    def _generate_summary(self, text: str, num_sentences: int = 3) -> str:
        """Return a naive summary: the first *num_sentences* sentences of *text*."""
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        # FIX: empty/whitespace-only input previously produced a bare ".".
        if not sentences:
            return ''
        return '. '.join(sentences[:num_sentences]) + '.'

    def _calculate_confidence(self, knowledge: Dict) -> float:
        """Score learned knowledge in [0, 1] by how much structure was found."""
        confidence = 0.0
        # More definitions = more confident (cap 0.3).
        if knowledge['definitions']:
            confidence += min(0.3, len(knowledge['definitions']) * 0.05)
        # More key points = more confident (cap 0.3).
        if knowledge['key_points']:
            confidence += min(0.3, len(knowledge['key_points']) * 0.05)
        # More relationships = more confident (cap 0.4).
        if knowledge['relationships']:
            confidence += min(0.4, len(knowledge['relationships']) * 0.05)
        return min(1.0, confidence)

    def _store_knowledge(self, topic: str, knowledge: Dict):
        """Append a timestamped knowledge record for *topic*."""
        if topic not in self.learned_knowledge:
            self.learned_knowledge[topic] = []
        self.learned_knowledge[topic].append({
            'knowledge': knowledge,
            'timestamp': self._get_timestamp(),
            'confidence': knowledge['learning_confidence']
        })
        logger.info(f"Stored knowledge for topic: {topic}")

    def get_learned_knowledge(self, topic: str) -> Optional[Dict]:
        """Return the highest-confidence knowledge for *topic*, or None."""
        if topic not in self.learned_knowledge or not self.learned_knowledge[topic]:
            return None
        best = max(
            self.learned_knowledge[topic],
            key=lambda x: x['confidence']
        )
        return best['knowledge']

    def search_and_find_answer(self, question: str) -> Dict:
        """Search for an answer to *question* and synthesize a reply."""
        logger.info(f"Searching answer for: {question}")

        # Extract key terms, then search each of the top three.
        key_terms = self._extract_key_terms(question)
        search_results = []
        for term in key_terms[:3]:
            result = self.search_and_learn(term, num_results=3)
            if result.get('results'):
                search_results.extend(result['results'])

        answer = self._synthesize_answer(question, search_results)
        return {
            'question': question,
            'answer': answer,
            'search_terms': key_terms,
            'sources': len(search_results),
            # Crude ratio of terms that yielded results.
            'confidence': len(search_results) / max(1, len(key_terms))
        }

    def _extract_key_terms(self, question: str) -> List[str]:
        """Extract key search terms from *question*, dropping filler words."""
        question_words = ['what', 'how', 'why', 'when', 'where', 'who', 'which', 'is', 'are', 'the', 'a', 'an']
        # FIX: tokenize with \w+ so trailing punctuation is stripped —
        # a plain whitespace split turned "Python?" into the term "python?".
        words = re.findall(r'\w+', question.lower())
        return [w for w in words if w not in question_words and len(w) > 2]

    def _synthesize_answer(self, question: str, search_results: List) -> str:
        """Concatenate snippets from up to three search results into an answer."""
        if not search_results:
            return f"No information found for: {question}"
        answer_parts = []
        for result in search_results[:3]:
            if isinstance(result, dict) and 'snippet' in result:
                answer_parts.append(result['snippet'])
        if answer_parts:
            return ' '.join(answer_parts)
        else:
            return "Information found but synthesis incomplete."

    def _get_timestamp(self) -> str:
        """Return the current local time as an ISO-8601 string."""
        # Local import keeps the module's top-level imports unchanged.
        from datetime import datetime
        return datetime.now().isoformat()

    def save_knowledge_cache(self):
        """Persist learned knowledge, search cache, and history to JSON."""
        try:
            os.makedirs('noahski_data', exist_ok=True)
            cache_file = 'noahski_data/google_learned_knowledge.json'
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump({
                    'learned_knowledge': self.learned_knowledge,
                    'search_cache': self.search_cache,
                    'search_history': self.search_history,
                }, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved knowledge cache: {len(self.learned_knowledge)} topics")
        except Exception as e:
            # Best-effort persistence: log and continue.
            logger.error(f"Error saving knowledge cache: {e}")

    def load_knowledge_cache(self):
        """Load previously saved knowledge from disk, if present."""
        try:
            cache_file = 'noahski_data/google_learned_knowledge.json'
            if os.path.exists(cache_file):
                with open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self.learned_knowledge = data.get('learned_knowledge', {})
                self.search_cache = data.get('search_cache', {})
                self.search_history = data.get('search_history', [])
                logger.info(f"Loaded knowledge cache: {len(self.learned_knowledge)} topics")
        except Exception as e:
            # Best-effort restore: a corrupt cache must not block startup.
            logger.error(f"Error loading knowledge cache: {e}")

    def get_learning_stats(self) -> Dict:
        """Return aggregate learning statistics."""
        return {
            'total_searches': len(self.search_history),
            'topics_learned': len(self.learned_knowledge),
            'cached_results': len(self.search_cache),
            # Mean of each topic's best confidence (0.0 when nothing learned).
            'average_confidence': sum(
                max(k['confidence'] for k in v)
                for v in self.learned_knowledge.values() if v
            ) / max(1, len(self.learned_knowledge)),
            'total_knowledge_points': sum(
                len(k.get('knowledge', {}).get('key_points', []))
                for v in self.learned_knowledge.values()
                for k in v
            )
        }
# Lazily-created module-level singleton.
_google_learner = None


def get_google_learner() -> GoogleSearchLearner:
    """Return the shared GoogleSearchLearner, creating it on first access."""
    global _google_learner
    if _google_learner is not None:
        return _google_learner
    _google_learner = GoogleSearchLearner()
    return _google_learner