| """
|
| GOOGLE SEARCH INTEGRATION & WEB LEARNING
|
| Enables self-learning through web search, article parsing, and knowledge extraction
|
| """
|
|
|
# Standard-library imports, grouped and alphabetized per PEP 8.
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional
from urllib.parse import quote

# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)
|
|
|
class GoogleSearchLearner:
    """
    Integrates Google search for self-learning
    Can search, parse, and learn from web content
    """

    def __init__(self):
        # In-memory state; repopulated from disk by load_knowledge_cache().
        self.search_cache = {}        # normalized query -> cached search result dict
        self.learned_knowledge = {}   # topic -> list of stored knowledge entries
        self.search_history = []      # chronological record of search attempts
        self.load_knowledge_cache()
|
|
|
| def search_and_learn(self, query: str, num_results: int = 5) -> Dict:
|
| """
|
| Search Google and learn from results
|
| (In production, this would use Google Custom Search API or similar)
|
| """
|
|
|
| logger.info(f"Searching for: {query}")
|
|
|
|
|
| cache_key = query.lower().strip()
|
| if cache_key in self.search_cache:
|
| logger.info(f"Using cached results for: {query}")
|
| return self.search_cache[cache_key]
|
|
|
|
|
|
|
| search_result = {
|
| 'query': query,
|
| 'success': False,
|
| 'results': [],
|
| 'error': 'Google Search API not configured'
|
| }
|
|
|
|
|
|
|
|
|
| self.search_history.append({
|
| 'query': query,
|
| 'timestamp': self._get_timestamp(),
|
| 'result_count': len(search_result['results'])
|
| })
|
|
|
| return search_result
|
|
|
| def extract_knowledge_from_text(self, text: str, topic: str = '') -> Dict:
|
| """Extract structured knowledge from text"""
|
|
|
| knowledge = {
|
| 'topic': topic,
|
| 'key_points': [],
|
| 'definitions': {},
|
| 'relationships': [],
|
| 'summary': '',
|
| 'learning_confidence': 0.0
|
| }
|
|
|
|
|
| definitions = self._extract_definitions(text)
|
| knowledge['definitions'] = definitions
|
|
|
|
|
| key_points = self._extract_key_points(text)
|
| knowledge['key_points'] = key_points
|
|
|
|
|
| relationships = self._extract_relationships(text, topic)
|
| knowledge['relationships'] = relationships
|
|
|
|
|
| summary = self._generate_summary(text, num_sentences=3)
|
| knowledge['summary'] = summary
|
|
|
|
|
| knowledge['learning_confidence'] = self._calculate_confidence(knowledge)
|
|
|
|
|
| self._store_knowledge(topic, knowledge)
|
|
|
| return knowledge
|
|
|
| def _extract_definitions(self, text: str) -> Dict[str, str]:
|
| """Extract potential definitions from text"""
|
| definitions = {}
|
|
|
|
|
| patterns = [
|
| r'(\w+)\s+(?:is|are)\s+(?:a|an)?\s+([^.!?]+)',
|
| r'(\w+):\s+([^.!?]+)',
|
| r'(\w+)\s+=\s+([^.!?]+)',
|
| ]
|
|
|
| for pattern in patterns:
|
| matches = re.findall(pattern, text)
|
| for key, value in matches:
|
| if len(key.split()) <= 3 and len(value) < 200:
|
| definitions[key.strip()] = value.strip()
|
|
|
| return definitions
|
|
|
| def _extract_key_points(self, text: str) -> List[str]:
|
| """Extract key points from text"""
|
| key_points = []
|
|
|
|
|
| sentences = re.split(r'[.!?]+', text)
|
|
|
|
|
| keywords = ['important', 'key', 'significant', 'note', 'must', 'should', 'critical']
|
|
|
| for sentence in sentences:
|
| if any(kw in sentence.lower() for kw in keywords):
|
| cleaned = sentence.strip()
|
| if cleaned and len(cleaned) > 10:
|
| key_points.append(cleaned)
|
|
|
| return key_points[:10]
|
|
|
| def _extract_relationships(self, text: str, main_topic: str) -> List[Dict]:
|
| """Extract relationships between concepts"""
|
| relationships = []
|
|
|
|
|
| patterns = [
|
| r'(\w+)\s+leads to\s+(\w+)',
|
| r'(\w+)\s+causes\s+(\w+)',
|
| r'(\w+)\s+related to\s+(\w+)',
|
| r'(\w+)\s+affects\s+(\w+)',
|
| ]
|
|
|
| for pattern in patterns:
|
| matches = re.findall(pattern, text, re.IGNORECASE)
|
| for source, target in matches:
|
| relationships.append({
|
| 'source': source,
|
| 'target': target,
|
| 'main_topic': main_topic,
|
| 'type': 'connection'
|
| })
|
|
|
| return relationships
|
|
|
| def _generate_summary(self, text: str, num_sentences: int = 3) -> str:
|
| """Generate summary of text"""
|
|
|
| sentences = re.split(r'[.!?]+', text)
|
| sentences = [s.strip() for s in sentences if s.strip()]
|
|
|
|
|
| summary_sentences = sentences[:num_sentences]
|
|
|
| return '. '.join(summary_sentences) + '.'
|
|
|
| def _calculate_confidence(self, knowledge: Dict) -> float:
|
| """Calculate how confident we are in learned knowledge"""
|
|
|
| confidence = 0.0
|
|
|
|
|
| if knowledge['definitions']:
|
| confidence += min(0.3, len(knowledge['definitions']) * 0.05)
|
|
|
|
|
| if knowledge['key_points']:
|
| confidence += min(0.3, len(knowledge['key_points']) * 0.05)
|
|
|
|
|
| if knowledge['relationships']:
|
| confidence += min(0.4, len(knowledge['relationships']) * 0.05)
|
|
|
| return min(1.0, confidence)
|
|
|
| def _store_knowledge(self, topic: str, knowledge: Dict):
|
| """Store learned knowledge"""
|
|
|
| if topic not in self.learned_knowledge:
|
| self.learned_knowledge[topic] = []
|
|
|
| self.learned_knowledge[topic].append({
|
| 'knowledge': knowledge,
|
| 'timestamp': self._get_timestamp(),
|
| 'confidence': knowledge['learning_confidence']
|
| })
|
|
|
| logger.info(f"Stored knowledge for topic: {topic}")
|
|
|
| def get_learned_knowledge(self, topic: str) -> Optional[Dict]:
|
| """Retrieve learned knowledge about a topic"""
|
|
|
| if topic not in self.learned_knowledge or not self.learned_knowledge[topic]:
|
| return None
|
|
|
|
|
| best = max(
|
| self.learned_knowledge[topic],
|
| key=lambda x: x['confidence']
|
| )
|
|
|
| return best['knowledge']
|
|
|
| def search_and_find_answer(self, question: str) -> Dict:
|
| """Search for answer to a question"""
|
|
|
| logger.info(f"Searching answer for: {question}")
|
|
|
|
|
| key_terms = self._extract_key_terms(question)
|
|
|
|
|
| search_results = []
|
| for term in key_terms[:3]:
|
| result = self.search_and_learn(term, num_results=3)
|
| if result.get('results'):
|
| search_results.extend(result['results'])
|
|
|
|
|
| answer = self._synthesize_answer(question, search_results)
|
|
|
| return {
|
| 'question': question,
|
| 'answer': answer,
|
| 'search_terms': key_terms,
|
| 'sources': len(search_results),
|
| 'confidence': len(search_results) / max(1, len(key_terms))
|
| }
|
|
|
| def _extract_key_terms(self, question: str) -> List[str]:
|
| """Extract key search terms from question"""
|
|
|
|
|
| question_words = ['what', 'how', 'why', 'when', 'where', 'who', 'which', 'is', 'are', 'the', 'a', 'an']
|
|
|
| words = question.lower().split()
|
| key_terms = [w for w in words if w not in question_words and len(w) > 2]
|
|
|
| return key_terms
|
|
|
| def _synthesize_answer(self, question: str, search_results: List) -> str:
|
| """Synthesize answer from search results"""
|
|
|
| if not search_results:
|
| return f"No information found for: {question}"
|
|
|
|
|
| answer_parts = []
|
| for result in search_results[:3]:
|
| if isinstance(result, dict) and 'snippet' in result:
|
| answer_parts.append(result['snippet'])
|
|
|
| if answer_parts:
|
| return ' '.join(answer_parts)
|
| else:
|
| return "Information found but synthesis incomplete."
|
|
|
| def _get_timestamp(self) -> str:
|
| """Get current timestamp"""
|
| from datetime import datetime
|
| return datetime.now().isoformat()
|
|
|
| def save_knowledge_cache(self):
|
| """Save learned knowledge to file"""
|
| try:
|
| os.makedirs('noahski_data', exist_ok=True)
|
|
|
| cache_file = 'noahski_data/google_learned_knowledge.json'
|
| with open(cache_file, 'w', encoding='utf-8') as f:
|
| json.dump({
|
| 'learned_knowledge': self.learned_knowledge,
|
| 'search_cache': self.search_cache,
|
| 'search_history': self.search_history,
|
| }, f, indent=2, ensure_ascii=False)
|
|
|
| logger.info(f"Saved knowledge cache: {len(self.learned_knowledge)} topics")
|
| except Exception as e:
|
| logger.error(f"Error saving knowledge cache: {e}")
|
|
|
| def load_knowledge_cache(self):
|
| """Load learned knowledge from file"""
|
| try:
|
| cache_file = 'noahski_data/google_learned_knowledge.json'
|
| if os.path.exists(cache_file):
|
| with open(cache_file, 'r', encoding='utf-8') as f:
|
| data = json.load(f)
|
|
|
| self.learned_knowledge = data.get('learned_knowledge', {})
|
| self.search_cache = data.get('search_cache', {})
|
| self.search_history = data.get('search_history', [])
|
|
|
| logger.info(f"Loaded knowledge cache: {len(self.learned_knowledge)} topics")
|
| except Exception as e:
|
| logger.error(f"Error loading knowledge cache: {e}")
|
|
|
| def get_learning_stats(self) -> Dict:
|
| """Get learning statistics"""
|
| return {
|
| 'total_searches': len(self.search_history),
|
| 'topics_learned': len(self.learned_knowledge),
|
| 'cached_results': len(self.search_cache),
|
| 'average_confidence': sum(
|
| max(k['confidence'] for k in v)
|
| for v in self.learned_knowledge.values() if v
|
| ) / max(1, len(self.learned_knowledge)),
|
| 'total_knowledge_points': sum(
|
| len(k.get('knowledge', {}).get('key_points', []))
|
| for v in self.learned_knowledge.values()
|
| for k in v
|
| )
|
| }
|
|
|
|
|
|
|
# Process-wide singleton instance, created lazily on first access.
_google_learner = None


def get_google_learner() -> GoogleSearchLearner:
    """Get or create global Google search learner"""
    global _google_learner
    if _google_learner is not None:
        return _google_learner

    _google_learner = GoogleSearchLearner()
    return _google_learner
|
|
|