import random
import re
from urllib.parse import urlparse

import requests
import spacy
import wikipedia
import wikipediaapi
from textblob import TextBlob, download_corpora


# TextBlob relies on NLTK corpora; NLTK skips anything already present,
# so this is only slow on the first run.
download_corpora.download_all()


# Load the spaCy English model; sentence splitting falls back to TextBlob
# if it is not installed.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("spaCy model not found. Please install it with: python -m spacy download en_core_web_sm")
    nlp = None
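
# If fetching the model at runtime is acceptable (it may not be in
# locked-down environments), a hedged alternative to printing a message is
# spacy.cli.download, sketched here but left commented out:
#
#     try:
#         nlp = spacy.load("en_core_web_sm")
#     except OSError:
#         spacy.cli.download("en_core_web_sm")
#         nlp = spacy.load("en_core_web_sm")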
|
|
def get_article_info_wiki_api(article_name):
    """Get article content, categories, and links using the wikipedia-api library."""
    try:
        # Recent wikipedia-api releases require a User-Agent string as the
        # first argument; the value below is a placeholder for your project.
        wiki_wiki = wikipediaapi.Wikipedia(
            user_agent='WikiDataCollector/1.0',
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI
        )
        page = wiki_wiki.page(article_name)

        if not page.exists():
            return None, None, None

        # Category keys come back as "Category:Foo"; strip the prefix.
        categories = [cat.replace('Category:', '') for cat in page.categories.keys()]

        external_links = extract_external_links(page.fullurl)

        return page.text, categories if categories else ['Uncategorized'], external_links
    except Exception as e:
        print(f"Error in get_article_info_wiki_api: {e}")
        return None, None, None
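
# Illustrative usage (the title is arbitrary):
#     text, cats, links = get_article_info_wiki_api("Alan Turing")
#     if text:
#         print(cats[:3], links[:2])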
|
|
def get_article_info(article_name):
    """Get article content and categories using a web-scraping approach."""
    try:
        # Resolve the page with the wikipedia library first.
        try:
            page = wikipedia.page(article_name, auto_suggest=True)
        except wikipedia.exceptions.DisambiguationError as e:
            # Ambiguous title: fall back to the first disambiguation option.
            print(f"Disambiguation page. Using first option: {e.options[0]}")
            page = wikipedia.page(e.options[0], auto_suggest=False)
        except wikipedia.exceptions.PageError:
            print(f"Page '{article_name}' not found on Wikipedia")
            return None, None, None

        content = page.content
        page_url = page.url

        # Scrape the rendered HTML for the category box.
        try:
            r = requests.get(page_url, timeout=10)
            html = r.text

            catlinks_regexp = re.compile(r'<div class="mw-normal-catlinks".*?>(.*?)</div>', re.DOTALL)
            catnames_regexp = re.compile(r'<a[^>]*>([^<]*)</a>')

            cat_src = catlinks_regexp.findall(html)
            if not cat_src:
                # Alternate markup identifies the box by id rather than class.
                catlinks_regexp = re.compile(r'<div id="catlinks".*?>(.*?)</div>', re.DOTALL)
                cat_src = catlinks_regexp.findall(html)

            if not cat_src:
                categories = ['Uncategorized']
            else:
                cats = catnames_regexp.findall(cat_src[0])
                # The first anchor is the "Categories" index link, not a category.
                categories = cats[1:] if len(cats) > 1 else ['Uncategorized']

            external_links = extract_external_links(page_url)

            return content, categories, external_links

        except requests.RequestException as e:
            print(f"Request error: {e}")
            # Fall back to the categories the wikipedia library already parsed.
            if hasattr(page, 'categories'):
                categories = list(page.categories)
                return content, categories if categories else ['Uncategorized'], []
            return content, ['Uncategorized'], []

    except Exception as e:
        print(f"Error in get_article_info: {e}")
        return None, None, None
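
# Illustrative usage; same (content, categories, links) contract as the
# API-based variant above:
#     content, cats, links = get_article_info("Alan Turing")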
|
|
def extract_external_links(wikipedia_url):
    """Extract external links from a Wikipedia page."""
    try:
        response = requests.get(wikipedia_url, timeout=10)
        html_content = response.text

        # Legacy markup: the heading text lives in a span with class mw-headline.
        external_links_section = re.search(
            r'<span class="mw-headline" id="External_links">External links</span>.*?(<ul>.*?</ul>)',
            html_content,
            re.DOTALL
        )

        if not external_links_section:
            # Newer markup puts the id directly on the heading element.
            external_links_section = re.search(
                r'<h2 id="External_links">External links</h2>.*?(<ul>.*?</ul>)',
                html_content,
                re.DOTALL
            )

        external_links = []
        if external_links_section:
            links = re.findall(r'<a[^>]*href="([^"]*)"[^>]*>', external_links_section.group(1))

            for link in links:
                # Skip internal wiki links and page anchors.
                if not link.startswith('/wiki/') and not link.startswith('#'):
                    # Keep only absolute URLs with a scheme and host.
                    parsed = urlparse(link)
                    if parsed.scheme and parsed.netloc:
                        external_links.append(link)

        return external_links[:10]

    except Exception as e:
        print(f"Error extracting external links: {e}")
        return []
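
# Regex-based HTML extraction is brittle against markup changes. If an extra
# dependency is acceptable, the same idea can be sketched with BeautifulSoup
# (pip install beautifulsoup4); MediaWiki marks external anchors with the
# "external" class:
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup(html_content, "html.parser")
#     links = [a["href"] for a in soup.select('a.external[href^="http"]')]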
|
|
def create_sentences_from_categories(categories):
    """Create meaningful sentences from categories."""
    sentences = []

    if categories:
        if len(categories) > 3:
            # Too many categories to list; pick three at random.
            main_categories = random.sample(categories, 3)
            category_sentence = f"This article is primarily about {', '.join(main_categories[:-1])} and {main_categories[-1]}."
        elif len(categories) > 1:
            category_sentence = f"This article is about {', '.join(categories[:-1])} and {categories[-1]}."
        else:
            category_sentence = f"This article is about {categories[0]}."

        sentences.append(category_sentence)

        for category in categories[:5]:
            sentences.append(f"It provides information related to {category}.")

    return sentences
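
# Illustrative usage:
#     create_sentences_from_categories(["History", "Science"])
#     -> ['This article is about History and Science.',
#         'It provides information related to History.',
#         'It provides information related to Science.']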
|
|
def extract_key_sentences(text, num_sentences=3):
    """Extract key sentences (here: the leading sentences) from the article text."""
    if not text:
        return []

    if nlp:
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
    else:
        # TextBlob fallback; cast its Sentence objects to plain strings so
        # both branches return the same type.
        blob = TextBlob(text)
        sentences = [str(sent) for sent in blob.sentences]

    return sentences[:num_sentences]
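
# Illustrative usage (with either backend):
#     extract_key_sentences("First point. Second point. Third point.", 2)
#     -> ['First point.', 'Second point.']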
|
|
def get_references_from_text(text):
    """Extract potential references from text using simple pattern matching."""
    patterns = [
        r'\b(?:https?://|www\.)\S+',                             # bare URLs
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',   # email addresses
        r'\b\d{4}\b.*?\b(?:press|university|institute|journal|research|general|code|greeting)\b',  # year followed by a publisher-like keyword
    ]

    references = []
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        references.extend(matches)

    return references[:5]
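
# Illustrative usage: the URL and email patterns both fire here.
#     get_references_from_text("See https://example.org or mail a@b.edu")
#     -> ['https://example.org', 'a@b.edu']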
|
|
userinput = input("Enter a prompt: ")


# First pass: structured access through the wikipedia-api library.
print("Using wikipedia-api approach:")
article_text, categories, external_links = get_article_info_wiki_api(userinput)
|
|
if article_text and categories:
    print(f"Number of categories: {len(categories)}")
    print(f"Categories: {categories}")

    category_sentences = create_sentences_from_categories(categories)
    for sentence in category_sentences:
        print(f"- {sentence}")

    key_sentences = extract_key_sentences(article_text)
    for i, sentence in enumerate(key_sentences, 1):
        print(f"{i}. {sentence}")

    if external_links:
        print("\nExternal links for more data:")
        for i, link in enumerate(external_links, 1):
            print(f"{i}. {link}")
    else:
        print("\nNo external links found in this article.")

    references = get_references_from_text(article_text)
    if references:
        print("\nPotential references found in text:")
        for i, ref in enumerate(references, 1):
            print(f"{i}. {ref}")

    # Tokenize the categories plus the opening of the article.
    combined_text = " ".join(categories) + " " + article_text[:500]
    blob = TextBlob(combined_text)
    words = blob.words
    print(f"\nExtracted words from combined content: {set(words[:20])}")

else:
    print("Page not found using wikipedia-api")
|
|
| print("\n" + "="*50 + "\n") |
|
|
| |
| print("Using web scraping approach:") |
| article_text, categories, external_links = get_article_info(userinput) |
|
|
if article_text and categories:
    print(f"Categories: {categories}")

    category_sentences = create_sentences_from_categories(categories)
    print("\nSentences from categories:")
    for sentence in category_sentences:
        print(f"- {sentence}")

    key_sentences = extract_key_sentences(article_text)
    print("\nKey sentences from the article:")
    for i, sentence in enumerate(key_sentences, 1):
        print(f"{i}. {sentence}")

    if external_links:
        print("\nExternal links for more data:")
        for i, link in enumerate(external_links, 1):
            print(f"{i}. {link}")
    else:
        print("\nNo external links found in this article.")

    references = get_references_from_text(article_text)
    if references:
        print("\nPotential references found in text:")
        for i, ref in enumerate(references, 1):
            print(f"{i}. {ref}")

    combined_text = " ".join(categories) + " " + article_text[:500]
    blob = TextBlob(combined_text)
    words = blob.words
    print(f"\nExtracted words from combined content: {set(words[:20])}")

else:
    print("Page not found using web scraping")
|
|
| print("\n" + "="*50) |
| print("Additional data collection options:") |
| print("1. Get related Wikipedia pages") |
| print("2. Search for academic papers on this topic") |
| print("3. Find news articles about this topic") |
| print("4. Extract data from external links") |
|
|
| |