electric-otter committed on
Commit
08af447
·
verified ·
1 Parent(s): ce34696

Create reto.py

Browse files
Files changed (1) hide show
  1. reto.py +303 -0
reto.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Third-party dependencies: spaCy (sentence segmentation), wikipedia and
# wikipedia-api (article retrieval), requests (HTTP scraping), textblob
# (fallback tokenization).
import spacy
import wikipedia
import requests
import re
import wikipediaapi
import textblob
from textblob import download_corpora
import random
from textblob import TextBlob
from urllib.parse import urljoin, urlparse
import time

# Download every NLTK corpus that TextBlob relies on.
# NOTE(review): this runs on every import and requires network access —
# consider running it once out-of-band (`python -m textblob.download_corpora`).
download_corpora.download_all()
15
# Load spaCy model for better sentence processing; fall back to None so
# extract_key_sentences() can use TextBlob instead.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # A bare `except:` here would also swallow KeyboardInterrupt/SystemExit.
    print("spaCy model not found. Please install it with: python -m spacy download en_core_web_sm")
    nlp = None
+
22
# Option 1: Using wikipedia-api library (recommended)
def get_article_info_wiki_api(article_name):
    """Get article content, categories, and links using wikipedia-api library.

    Returns a (text, categories, external_links) tuple, or (None, None, None)
    when the page does not exist or any error occurs.
    """
    try:
        # wikipedia-api >= 0.6 expects a descriptive user agent as its first
        # argument (Wikimedia API etiquette) — pass everything by keyword so
        # the language is never misread as the user agent.
        wiki_wiki = wikipediaapi.Wikipedia(
            user_agent='reto.py article collector',
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI
        )
        page = wiki_wiki.page(article_name)

        if not page.exists():
            return None, None, None

        # Get categories and remove 'Category:' prefix
        categories = [cat.replace('Category:', '') for cat in page.categories.keys()]

        # Get external links by scraping the rendered article page
        external_links = extract_external_links(page.fullurl)

        return page.text, categories if categories else ['Uncategorized'], external_links
    except Exception as e:
        # Broad catch keeps the script's best-effort behavior: report and
        # return the sentinel triple instead of crashing.
        print(f"Error in get_article_info_wiki_api: {e}")
        return None, None, None
45
+
46
# Option 2: Using requests and regex (updated approach)
def get_article_info(article_name):
    """Get article content and categories using web scraping approach.

    Returns a (content, categories, external_links) tuple, or
    (None, None, None) when the page cannot be resolved.
    """
    try:
        # Get page content using wikipedia library with better error handling
        try:
            page = wikipedia.page(article_name, auto_suggest=True)
            summary = page.summary
            content = page.content
            page_url = page.url
        except wikipedia.exceptions.DisambiguationError as e:
            # If it's a disambiguation page, use the first suggestion
            print(f"Disambiguation page. Using first option: {e.options[0]}")
            page = wikipedia.page(e.options[0], auto_suggest=False)
            summary = page.summary
            content = page.content
            page_url = page.url
        except wikipedia.exceptions.PageError:
            print(f"Page '{article_name}' not found on Wikipedia")
            return None, None, None

        # Get categories via web scraping with updated regex
        try:
            r = requests.get(page_url, timeout=10)
            html = r.text

            # Updated regex pattern for categories.
            # NOTE(review): parsing HTML with regex is fragile; these
            # patterns depend on the current MediaWiki markup.
            catlinks_regexp = re.compile(r'<div class="mw-normal-catlinks".*?>(.*?)<\/div>', re.DOTALL)
            catnames_regexp = re.compile(r'<a[^>]*>([^<]*)<\/a>')

            cat_src = catlinks_regexp.findall(html)
            if not cat_src:
                # Try alternative pattern (older/alternative catlinks markup)
                catlinks_regexp = re.compile(r'<div id="catlinks".*?>(.*?)<\/div>', re.DOTALL)
                cat_src = catlinks_regexp.findall(html)

            if not cat_src:
                categories = ['Uncategorized']
            else:
                cats = catnames_regexp.findall(cat_src[0])
                # Skip the first element which is typically "Categories:"
                categories = cats[1:] if len(cats) > 1 else ['Uncategorized']

            # Get external links
            external_links = extract_external_links(page_url)

            return content, categories, external_links

        except requests.RequestException as e:
            print(f"Request error: {e}")
            # Fallback to using wikipedia library categories if available;
            # external links are unavailable without the scraped HTML.
            if hasattr(page, 'categories'):
                categories = list(page.categories)
                return content, categories if categories else ['Uncategorized'], []
            return content, ['Uncategorized'], []

    except Exception as e:
        # Last-resort catch so the calling script keeps running.
        print(f"Error in get_article_info: {e}")
        return None, None, None
105
+
106
def extract_external_links(wikipedia_url):
    """Extract external links from a Wikipedia page.

    Scrapes the article HTML, locates the "External links" section, and
    returns up to 10 absolute URLs found there (empty list on any error).
    """
    try:
        response = requests.get(wikipedia_url, timeout=10)
        html_content = response.text

        # Find the External links section.
        # NOTE(review): relies on the `mw-headline` span markup — verify
        # against current Wikipedia HTML, which changes over time.
        external_links_section = re.search(
            r'<span class="mw-headline" id="External_links">External links</span>.*?(<ul>.*?</ul>)',
            html_content,
            re.DOTALL
        )

        if not external_links_section:
            # Try alternative pattern (headline wrapped in an <h2>)
            external_links_section = re.search(
                r'<h2><span class="mw-headline" id="External_links">External links</span>.*?(<ul>.*?</ul>)',
                html_content,
                re.DOTALL
            )

        external_links = []
        if external_links_section:
            # Extract href values from anchors inside the section's <ul>
            links = re.findall(r'<a[^>]*href="([^"]*)"[^>]*>', external_links_section.group(1))

            # Filter and format external links
            for link in links:
                # Skip internal Wikipedia links
                if not link.startswith('/wiki/') and not link.startswith('#'):
                    # Make sure it's a valid URL (has both scheme and host)
                    parsed = urlparse(link)
                    if parsed.scheme and parsed.netloc:
                        external_links.append(link)

        return external_links[:10]  # Return first 10 external links

    except Exception as e:
        # Best-effort helper: report and return an empty list.
        print(f"Error extracting external links: {e}")
        return []
146
+
147
def create_sentences_from_categories(categories):
    """Create meaningful sentences from categories."""
    # Nothing to describe when there are no categories at all.
    if not categories:
        return []

    count = len(categories)
    if count > 3:
        # With many categories, randomly pick three for the headline sentence.
        picked = random.sample(categories, 3)
        headline = f"This article is primarily about {', '.join(picked[:-1])} and {picked[-1]}."
    elif count > 1:
        headline = f"This article is about {', '.join(categories[:-1])} and {categories[-1]}."
    else:
        headline = f"This article is about {categories[0]}."

    result = [headline]
    # One follow-up sentence for each of (at most) the first five categories.
    result.extend(f"It provides information related to {category}." for category in categories[:5])
    return result
169
+
170
def extract_key_sentences(text, num_sentences=3):
    """Extract key sentences from the article text.

    The opening sentences of a Wikipedia article are its lead summary, so the
    first `num_sentences` sentences serve as the "key" ones.

    Returns a list of strings; empty list when `text` is empty/None.
    """
    if not text:
        return []

    # Use spaCy for better sentence segmentation if available
    if nlp:
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
    else:
        # Fallback to TextBlob. Convert its Sentence objects to plain str so
        # both branches return the same type (the original returned Sentence
        # objects here, making the return type depend on which library ran).
        blob = TextBlob(text)
        sentences = [str(sentence) for sentence in blob.sentences]

    # Return the first few sentences (usually the most important)
    return sentences[:num_sentences]
188
+
189
def get_references_from_text(text):
    """Extract potential references from text using simple pattern matching.

    Scans for URLs, e-mail addresses, and "year + organisation" phrases;
    returns at most the first 5 matches, in pattern order.
    """
    # Look for common citation patterns
    patterns = [
        r'\b(?:https?://|www\.)\S+',  # URLs
        # Emails. The TLD class is [A-Za-z]: the original [A-Z|a-z] put a
        # literal '|' inside the character class, wrongly allowing pipes.
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        r'\b\d{4}\b.*?\b(?:press|university|institute|journal|research|general|code|greeting)\b',  # Year + org
    ]

    references = []
    for pattern in patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        references.extend(matches)

    return references[:5]  # Return first 5 references
204
+
205
# Main execution: ask the user for a topic, then collect and print article
# data twice — once via wikipedia-api, once via the web-scraping approach —
# so the two collection strategies can be compared.
userinput = input("Enter a prompt: ")

# Using the wikipedia-api approach (recommended)
print("Using wikipedia-api approach:")
article_text, categories, external_links = get_article_info_wiki_api(userinput)

if article_text and categories:
    print(f"Number of categories: {len(categories)}")
    print(f"Categories: {categories}")

    # Create sentences from categories
    category_sentences = create_sentences_from_categories(categories)
    for sentence in category_sentences:
        print(f"- {sentence}")

    # Extract key sentences from the article
    key_sentences = extract_key_sentences(article_text)
    for i, sentence in enumerate(key_sentences, 1):
        print(f"{i}. {sentence}")

    # Show external links for more data sources
    if external_links:
        print("\nExternal links for more data:")
        for i, link in enumerate(external_links, 1):
            print(f"{i}. {link}")
    else:
        print("\nNo external links found in this article.")

    # Extract potential references from text
    references = get_references_from_text(article_text)
    if references:
        print("\nPotential references found in text:")
        for i, ref in enumerate(references, 1):
            print(f"{i}. {ref}")

    # Combine categories and article content
    combined_text = " ".join(categories) + " " + article_text[:500]  # First 500 chars of article
    blob = TextBlob(combined_text)
    words = blob.words
    print(f"\nExtracted words from combined content: {set(words[:20])}")  # Show first 20 unique words

else:
    print("Page not found using wikipedia-api")

print("\n" + "="*50 + "\n")

# Using the web scraping approach (same reporting as above, but sourced from
# get_article_info; note the variables from the first pass are overwritten)
print("Using web scraping approach:")
article_text, categories, external_links = get_article_info(userinput)

if article_text and categories:
    print(f"Categories: {categories}")

    # Create sentences from categories
    category_sentences = create_sentences_from_categories(categories)
    print("\nSentences from categories:")
    for sentence in category_sentences:
        print(f"- {sentence}")

    # Extract key sentences from the article
    key_sentences = extract_key_sentences(article_text)
    print("\nKey sentences from the article:")
    for i, sentence in enumerate(key_sentences, 1):
        print(f"{i}. {sentence}")

    # Show external links for more data sources
    if external_links:
        print("\nExternal links for more data:")
        for i, link in enumerate(external_links, 1):
            print(f"{i}. {link}")
    else:
        print("\nNo external links found in this article.")
    # Extract potential references from text
    references = get_references_from_text(article_text)
    if references:
        print("\nPotential references found in text:")
        for i, ref in enumerate(references, 1):
            print(f"{i}. {ref}")

    # Combine categories and article content
    combined_text = " ".join(categories) + " " + article_text[:500]  # First 500 chars of article
    blob = TextBlob(combined_text)
    words = blob.words
    print(f"\nExtracted words from combined content: {set(words[:20])}")  # Show first 20 unique words

else:
    print("Page not found using web scraping")

# Additional functionality: Get related Wikipedia pages
print("\n" + "="*50)
print("Additional data collection options:")
print("1. Get related Wikipedia pages")
print("2. Search for academic papers on this topic")
print("3. Find news articles about this topic")
print("4. Extract data from external links")

# You could expand this section to implement these options
# For example, using APIs like Google Scholar, News API, etc.