# NOTE: the three lines below were Hugging Face Spaces page residue
# ("Spaces: / Sleeping / Sleeping") accidentally captured with the source;
# kept here as a comment so the file remains valid Python.
# Standard library
import email.utils
import os
import re
import sys
import urllib
import urllib.error    # FIX: needed for the urllib.error.HTTPError/URLError handlers below
import urllib.parse
import urllib.request  # FIX: needed; the scrapers call urllib.request.Request/urlopen,
                       # which `import urllib` alone does not provide
from urllib.parse import urlparse, urljoin

# Third party
import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib3 import disable_warnings
from urllib3.exceptions import InsecureRequestWarning

# Suppress warnings for requests made with verify=False (WeDocs scraping).
disable_warnings(InsecureRequestWarning)
def get_language_code(query):
    """Bidirectional, case-insensitive lookup in the module-level language_dict.

    Given a language name (key) the matching code (value) is returned, and
    vice versa.

    Args:
        query (str): A language name or a language code.

    Returns:
        str | None: The corresponding code/name, or None when nothing matches.
    """
    needle = query.lower()
    for name, code in language_dict.items():
        if needle == name.lower():
            return code
        if needle == code.lower():
            return name
    return None
# Language-name <-> code table used by get_language_code(); codes follow the
# UNEP site's URL prefixes (e.g. "zh-hans", "pt-br").
language_dict = {
    "Spanish": "es",
    "French": "fr",
    "Swahili": "sw",
    "English": "en",
    "Chinese": "zh-hans",
    "Portuguese": "pt-br",
    "Russian": "ru",
    "Arabic": "ar",
}
# Example usage:
#   get_language_code("Spanish")  -> "es"
#   get_language_code("fr")       -> "French"
# Extract node's number from UNEP URL
def find_UNEP_node(unep_full_link: str) -> str:
    """Fetch a UNEP page and return its Drupal node id.

    The node id is common to all language versions of the page, so it can be
    used to build language-specific URLs.

    Args:
        unep_full_link (str): Full URL of a page on www.unep.org.

    Returns:
        str: The node id (e.g. '34817'), or None when the page could not be
        fetched or its body was empty.

    Examples:
        >>> find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts')
        '34817'
    """
    # Browser-like headers to bypass CloudFlare, as suggested by
    # https://stackoverflow.com/a/74674276
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        print("Scraping successful")
        # FIX: read the already-open response instead of issuing a second
        # request (the original called urlopen twice per page).
        r = response.read().decode('utf-8')
        if r:
            soup = BeautifulSoup(r, 'html.parser')
            # The active language item lives inside <ul class="links">.
            ul_element = soup.find('ul', class_='links')
            li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))
            # 'data-drupal-link-system-path' looks like 'node/34817'.
            attribute_value = li_element.get('data-drupal-link-system-path')
            return attribute_value.split('node/')[1]
        return None
# test
#print(find_UNEP_node('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'))
# Main function: finds the language version of a web article in UNEP website.
def convert_UNEP_url(unep_full_link: str, target_lang: str = 'en') -> str:
    """Access the input URL and return the URL of its translated version.

    Args:
        unep_full_link (str): Full URL of a page on www.unep.org.
        target_lang (str): Target language code, default 'en'.

    Returns:
        str: The converted URL, or None when the page could not be fetched.

    Raises:
        ValueError: When the page was fetched but neither a language link nor
            Drupal node information was found (typically a CloudFlare block).

    Examples:
        >>> convert_UNEP_url('https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts', 'es')
        'https://www.unep.org/es/noticias-y-reportajes/reportajes/los-pueblos-indigenas-recurren-los-tribunales-ante-la-crisis'
    """
    # Browser-like headers to bypass CloudFlare (https://stackoverflow.com/a/74674276).
    req = urllib.request.Request(unep_full_link)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        print("Scraping successful")
        # FIX: read the already-open response instead of fetching the page a
        # second time as the original did.
        r = response.read().decode('utf-8')
        if not r:
            print("\n<-- Error code. The programme could not access the webpage, forbidden")
            return None
        soup = BeautifulSoup(r, 'html.parser')
        # Direct translation link: <a class="language-link" hreflang=...>.
        lenguas = soup.find("a", class_="language-link", hreflang=target_lang)
        if lenguas:
            href = lenguas['href']
            # Some links carry a bare '/node' suffix that must be stripped.
            if href.endswith('/node'):
                return f"https://www.unep.org{href[0:-5]}"
            return f"https://www.unep.org{href}"
        # Fallback: derive the Drupal node id from the active language item.
        ul_element = soup.find('ul', class_='links')
        if ul_element:
            li_element = ul_element.find('li', class_=lambda x: x and x.endswith('is-active'))
            node_value = li_element.get('data-drupal-link-system-path')
            return find_from_nodeLink(int(node_value.split("/")[1]), target_lang)
        raise ValueError("Error: Webpage accessed but the tag 'a', class_='language-link' was not found. Probably because the website was blocked by firewall/CloudFlare")
# test
#input = 'https://www.unep.org/news-and-stories/story/climate-crisis-alters-their-lands-indigenous-peoples-turn-courts'
#print(convert_UNEP_url(input, 'es'))
#print(convert_UNEP_url(input, 'fr'))
# Language codes accepted by the UNEP site's /<lang>/node/<id> URL scheme
# (plus legacy aliases that are normalized below).
UNEP_LANG_CODES = ['ar', 'es', 'fr', 'ru', 'sw', 'pt-br', 'ch', 'zh', 'zh-hans', 'en']

def find_from_nodeLink(node_input, target_lang='empty'):
    """Build or rewrite a UNEP node URL for the requested language.

    Args:
        node_input (str, int): Either a UNEP URL containing 'node/<id>', or an
            integer node id (or its string representation).
        target_lang (str): Target language code. 'empty' (the default) is
            treated as English; 'ch'/'zh'/'cn' normalize to 'zh-hans' and
            'pt'/'pt-pt' to 'pt-br'.

    Returns:
        str: The converted URL.

    Raises:
        ValueError: If the input is neither a URL nor an id, if the URL does
            not match the UNEP node pattern, or the language code is unknown.

    Examples:
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'empty')
        'https://www.unep.org/en/node/30010'
        >>> find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans')
        'https://www.unep.org/zh-hans/node/30010'
        >>> find_from_nodeLink(30010, 'fr')
        'https://www.unep.org/fr/node/30010'
        >>> find_from_nodeLink('30010', 'fr')
        'https://www.unep.org/fr/node/30010'
    """
    # FIX (docs only): the original doctest claimed 'empty' yields a URL with
    # no language segment, but the code maps 'empty' -> 'en' and produces
    # '/en/node/...'; the doctest above now matches actual behavior.
    if isinstance(node_input, str) and node_input.isdigit():
        node_input = int(node_input)
    if isinstance(node_input, int):
        # Build a URL carrying the raw code; the re.sub below normalizes it.
        node_url = f'https://www.unep.org/{target_lang}/node/{node_input}'
    elif isinstance(node_input, str):
        node_url = node_input
    else:
        raise ValueError("Error: Provide either a string URL or an integer ID (or a string representation of an integer)")
    pattern = r"https://www\.unep\.org/[a-z]*-?[a-z]*/?node/(\d+)"
    # Normalize legacy / alias language codes.
    if target_lang == "empty":
        target_lang = "en"
    if target_lang in ["ch", 'zh', 'cn']:
        target_lang = "zh-hans"
    if target_lang in ['pt', 'pt-pt']:
        target_lang = "pt-br"
    if target_lang not in UNEP_LANG_CODES:
        raise ValueError("Error: Provide a language code among these: 'ar','es','fr','ru','sw','pt-br','zh-hans', 'en' or leave empty")
    if not re.findall(pattern, node_url):
        raise ValueError("Error: URL not found, or website blocked by firewall/CloudFare")
    # Swap (or insert) the language segment in the URL.
    return re.sub(pattern, r"https://www.unep.org/{}/node/\1".format(target_lang), node_url)
# Generic scraper
def get_HTML_generic(any_url: str) -> BeautifulSoup:
    """Fetch any URL and return its parsed HTML.

    FIX (docs): the original docstring was a copy-paste from
    convert_WeDocs_href and described the wrong function.

    Args:
        any_url (str): URL of any website.

    Returns:
        BeautifulSoup: Parsed HTML document, or None when the request failed
        or the response body was empty.
    """
    # Browser-like headers to bypass CloudFlare (https://stackoverflow.com/a/74674276).
    req = urllib.request.Request(any_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason} when accessing {any_url}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason} when accessing {any_url}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e} when accessing {any_url}")
        return None
    else:
        print("Scraping successful")
        # FIX: reuse the open response rather than fetching the URL twice.
        r = response.read().decode('utf-8')
        if r:
            return BeautifulSoup(r, 'html.parser')
        return None
# Example usage with an integer ID provided as a string
#print(find_from_nodeLink('30010', 'fr'))
#print(find_from_nodeLink(30010, 'fr'))
#print(find_from_nodeLink('https://www.unep.org/pt-br/node/30010', 'zh-hans'))
def try_lang_switcher(switcher_soup, lang_code: str, base_url) -> str:
    """Look up *lang_code* in a page's language-switcher menu.

    Scans the first <ul> whose class starts with 'language-switcher' and
    returns the first linked href containing *lang_code*, resolved against
    *base_url*. Returns None when no switcher or no matching link exists.
    """
    menu = switcher_soup.find('ul', class_=lambda value: value and value.startswith('language-switcher'))
    if menu is None:
        return None
    for anchor in menu.find_all('a'):
        href = anchor['href']
        if lang_code in href:
            return urljoin(base_url, href)
    return None
# Function to concatenate absolute paths if URL cannot be accessed
from urllib.parse import urljoin

def concatenate_missing_segments(arg1, arg2):
    """Append to *arg2* the tail that *arg1* has beyond *arg2*'s length.

    NOTE(review): this is a pure length-based splice — it assumes the two
    URLs share an equally long prefix (e.g. same-length language codes such
    as '/en/' vs '/ar/'); confirm before relying on it with codes of
    differing length ('pt-br' vs 'en').

    Args:
        arg1 (str): The longer URL with the extra segments, e.g.
            "https://www.unep.org/interactive/explore-ecosystems/mountains/en/index.php#/mountain-intro"
        arg2 (str): The shorter target URL, e.g.
            "https://www.unep.org/interactive/explore-ecosystems/mountains/ar"

    Returns:
        str | None: arg2 plus arg1's extra tail, e.g.
        "https://www.unep.org/interactive/explore-ecosystems/mountains/ar/index.php#/mountain-intro",
        or None when arg1 is not longer than arg2.
    """
    if len(arg1) <= len(arg2):
        return None
    return arg2 + arg1[len(arg2):]
def convert_URL_anyWebsite(any_web_url: str, lang_code) -> str:
    """Try several heuristics to find the *lang_code* version of any webpage.

    Order tried: language-switcher menu, WHO-style data-sf-role tags,
    generic hreflang links, then Drupal 'language-link' anchors (UNESCO).

    Args:
        any_web_url (str): URL of the page to localize.
        lang_code (str): Target language code.

    Returns:
        str | None: Localized URL, or None when nothing matched or the page
        could not be fetched.
    """
    # Access the URL to get the HTML with BeautifulSoup --> soup object
    sauce_html = get_HTML_generic(any_web_url)
    print(type(sauce_html))
    if not sauce_html:
        return None
    # 1) language-switcher menu
    switcher_link = try_lang_switcher(sauce_html, lang_code.lower(), any_web_url)
    if switcher_link and get_HTML_generic(switcher_link):
        return switcher_link
    elif switcher_link:
        # Switcher URL exists but is not reachable: rebuild it from the
        # original URL's extra path segments.
        return concatenate_missing_segments(any_web_url, switcher_link)
    # 2) WHO news style: tags carrying data-sf-role="<lang>"
    # FIX: run each find_all scan once instead of twice (the original
    # evaluated the same lambda scan in the condition and again in the body).
    sf_tags = sauce_html.find_all(lambda tag: tag.has_attr('data-sf-role') and tag['data-sf-role'] == lang_code)
    if sf_tags:
        print("trying WHO")
        print(sf_tags)
        return sf_tags[0]['value']
    # 3) generic hreflang links
    hreflang_tags = sauce_html.find_all(lambda tag: tag.has_attr('hreflang') and tag['hreflang'] == lang_code)
    if hreflang_tags:
        print("trying hreflang")
        return hreflang_tags[0]['href']
    # 4) Drupal 'language-link' anchors (working for UNESCO)
    print("trying language_link")
    lang_tag = sauce_html.find("a", class_="language-link", hreflang=lang_code)
    if lang_tag != None:
        return urljoin(any_web_url, lang_tag['href'])
    return None
#output_li = convert_URL_anyWebsite("cleanairblueskies@un.org", "es")
#print(output_li)
def weDocs_short(weDocs_url) -> str:
    """Replace a language-specific WeDocs link with its landing page.

    Args:
        weDocs_url (str): URL on wedocs.unep.org (bitstream or handle form).

    Returns:
        str: Landing-page URL of the document (not language specific). URLs
        that do not match the WeDocs pattern are returned unchanged.

    Example:
        >>> weDocs_short('https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/Practical_Guide.pdf?sequence=1&isAllowed=y')
        'https://wedocs.unep.org/handle/20.500.11822/43104'
    """
    # FIX: escaped the dots in the domain (previously matched any character)
    # and corrected the docstring example, which showed a trailing slash the
    # substitution never produces.
    return re.sub(
        r"https://wedocs\.unep\.org/(bitstream/)?handle/([\w.-]+/\d+).+",
        r"https://wedocs.unep.org/handle/\2",
        weDocs_url,
    )
# WeDocs link converter: accesses a short WeDocs link and returns a
# language-specific download URL (pdf).
def convert_WeDocs_href(url: str, target_lang: str = 'English') -> str:
    """Find the language-specific PDF link on a WeDocs landing page.

    Args:
        url (str): Short landing-page URL on wedocs.unep.org.
        target_lang (str): Language name of the document to find.

    Returns:
        str: Download link of the PDF in the requested language, or a
        human-readable error message string.

    Example:
        >>> convert_WeDocs_href('https://wedocs.unep.org/handle/20.500.11822/43104', 'Chinese')
        'https://wedocs.unep.org/bitstream/handle/20.500.11822/43104/PracticalGuide_ZH.pdf?sequence=5&isAllowed=y'
    """
    try:
        # verify=False because the site's certificate chain fails validation;
        # the matching InsecureRequestWarning is disabled at import time.
        response = requests.get(url, verify=False)
        if response.status_code != 200:
            return "Failed to retrieve the URL."
        # Match the language name anywhere in a link's text.
        # TODO normalize to also accept codes (e.g. 'RU' instead of Russian).
        pattern = re.compile(r".*{}.*".format(re.escape(target_lang.capitalize())))
        soup = BeautifulSoup(response.text, 'html.parser')
        text_node = soup.find(string=pattern)
        # FIX: the original dereferenced .parent on a possible None, so a
        # missing language crashed into the broad except and returned the
        # exception text; report the intended message instead.
        if text_node is None:
            return f"No link with '{target_lang}' text found."
        lang_link = text_node.parent['href']
        if lang_link:
            # Merge the domain and PDF path to create the complete link.
            return "https://wedocs.unep.org" + lang_link
        return f"No link with '{target_lang}' text found."
    except Exception as e:
        return str(e)
# Example usage:
#ch_href = convert_WeDocs_href(url, "Chinese")
#print(ch_href)
def access_un_library_by_id(user_input_id):
    """Search the UN Digital Library and return the first result's URL.

    Args:
        user_input_id (str): Search term — typically the job id extracted
            from a dds-ny PDF URL (e.g. 'N0651207').

    Returns:
        str | None: Absolute URL of the first search result, or None when
        the request fails or no result link is found.
    """
    try:
        # Base URL
        base_url = "https://digitallibrary.un.org/search?"
        # Construct the URL with the user-provided ID
        # (ln=fr only sets the UI language of the search page).
        url = f"{base_url}ln=fr&p={user_input_id}&f=&c=Resource%20Type&c=UN%20Bodies&sf=&so=d&rg=50&fti=0"
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            print("Request was successful. Content:")
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the <div> with class="result-title"
            result_title_div = soup.find('div', class_='result-title')
            if result_title_div:
                # Find the first <a> tag within the result-title div and get its href value
                result_title_a = result_title_div.find('a', href=True)
                if result_title_a:
                    href_value = result_title_a['href']
                    # hrefs on the results page are site-relative.
                    return f"https://digitallibrary.un.org{href_value}"
                else:
                    print("No <a> tag found inside result-title.")
            else:
                print("No result-title div found in the HTML.")
            return None
        else:
            print(f"Failed to retrieve the URL. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
# Get user input for the ID
#user_input_id = input("Enter the ID: ")
# Call the function with user input
#resultado = access_un_library_by_id(user_input_id)
#print(resultado)
# Send an URL request with headers to bypass CloudFlare as suggested by https://stackoverflow.com/a/74674276
def access_un_library_byResourceURL(landing_url: str) -> BeautifulSoup:
    """Fetch a UN Digital Library record page and return it parsed.

    Args:
        landing_url (str): URL of the record on digitallibrary.un.org.

    Returns:
        BeautifulSoup: Parsed page, or None when the request failed.

    Raises:
        ValueError: When the response body is empty.
    """
    req = urllib.request.Request(landing_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')
    try:
        response = urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        print(f"HTTPError: {e.code} - {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"URLError: {e.reason}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        print("Scraping successful")
        # FIX: reuse the open response rather than issuing a second request.
        r = response.read().decode('utf-8')
        if r:
            return BeautifulSoup(r, 'html.parser')
        # HTML error
        raise ValueError("Error in parsing the website content in HTML")
def extract_info_UNdocLink(url, lang2_code):
    """Rewrite an undocs.org document URL to another language.

    Args:
        url (str): UNDocs URL such as
            'https://undocs.org/en/UNEP/EA.5/28/Corr.1' — the 2-letter
            language segment is optional.
        lang2_code (str): Target 2-letter language code.

    Returns:
        str | None: The URL with *lang2_code* as its language segment, or
        None when the URL does not match the expected pattern.
    """
    # Groups: optional lang, symbol (UNEP/A/...), doc type, session, doc id.
    pattern = r'https://undocs\.org/([a-z]{2})?/?([A-Z]+)/(.*?)/(\d+)/(.*?)$'
    match = re.match(pattern, url)
    if not match:
        return None
    # FIX: dropped the unused locals; the original also mislabeled
    # match.group(0) (the whole match) as 'record_id'.
    _, symbol, doc_type, unga, resolution_id = match.groups()
    return f"https://undocs.org/{lang2_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}"
# Example usage:
#   extract_info_UNdocLink("https://undocs.org/en/UNEP/EA.5/28/Corr.1", "fr")
#   -> 'https://undocs.org/fr/UNEP/EA.5/28/Corr.1'
# Define the language dictionary.
# NOTE: this module-level redefinition is what get_language_code() actually
# uses at runtime (it shadows the earlier definition). FIX: keep it aligned
# with the Gradio dropdown codes — the original dropped Swahili/Portuguese
# and mapped Chinese to 'ch', so dropdown codes 'sw', 'pt-br' and 'zh-hans'
# resolved to None and broke the downstream converters.
language_dict = {
    "Spanish": "es",
    "French": "fr",
    "Swahili": "sw",
    "English": "en",
    "Chinese": "zh-hans",
    "Portuguese": "pt-br",
    "Russian": "ru",
    "Arabic": "ar"
}
#input_language = "Russian"
# 1.7 UN Docs
def get_jobID_undocs(url):
    """Extract the job ID from a documents-dds-ny.un.org PDF URL.

    Args:
        url (str): The URL of the document on ny.un.org.

    Returns:
        str | None: The alphanumeric id preceding '.pdf', or None when the
        URL does not match.
    """
    found = re.search(r'dds-ny.*/([A-Za-z0-9]+)\.pdf', url)
    if found is None:
        return None
    return found.group(1)
# Extract the `value` attribute of <option> tags with the language-suffix
# pattern and build undocs.org short links for the requested language.
def find_lang_UNdoc(un_docs_link, input_language):
    """Resolve a UN document link to its *input_language* version.

    Handles three input shapes: undocs.org links (rewritten directly),
    dds-ny PDF links (resolved through the UN Digital Library search), and
    digitallibrary.un.org records (scraped for per-language PDFs).

    Args:
        un_docs_link (str): Input document URL.
        input_language (str): Language NAME (e.g. 'Russian'), matched against
            the internal UN_languages_dict.

    Returns:
        For undocs.org input, a single URL string (or None). Otherwise a
        list of up to 3 links [digital-library PDF, multilingual undocs
        shortlink, monolingual undocs shortlink], or None when nothing
        matching the requested language was found.
    """
    un_library_url = un_docs_link
    # Define the language dictionary (UN official languages).
    UN_languages_dict = {
        "Spanish": "es",
        "French": "fr",
        "English": "en",
        "Chinese": "ch",
        "Russian": "ru",
        "Arabic": "ar"
    }
    if "undocs.org" in un_docs_link:
        return extract_info_UNdocLink(un_docs_link, input_language)
    elif "dds-ny" in un_docs_link:
        # Extract the job id, then locate the library record via search.
        un_library_url_ID = get_jobID_undocs(un_docs_link)
        print(un_library_url_ID)
        un_library_url = access_un_library_by_id(un_library_url_ID)
        print(un_library_url)
    elif "digitallibrary.un.org" in un_docs_link:
        un_library_url = un_docs_link
    try:
        # Get HTML from the UN Digital Library record URL.
        my_soup = access_un_library_byResourceURL(un_library_url)
        if my_soup is None:
            return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None
    else:
        # PDF links end in '-XX.pdf' where XX is a 2-letter language code.
        regex_pattern = r"-(\w{2})\.pdf"
        options = my_soup.find_all('option', value=re.compile(regex_pattern))
        # FIX: the original left output_links unbound and raised NameError
        # whenever no <option> matched the requested language (the common
        # case for non-UN URLs, and localize_URL calls this for every link);
        # initialize it so the caller gets None and can fall through.
        output_links = None
        for option in options:
            value = option['value']
            match = re.search(regex_pattern, value)
            if match:
                language_code = match.group(1)
                # Map the 2-letter code back to a language name.
                language = next((k for k, v in UN_languages_dict.items() if v.startswith(language_code.lower())), 'Unknown')
                # Prepare the direct link for the requested language.
                if language == input_language:
                    output_links = [value]
                    # Capture groups of a digital-library PDF URL.
                    pattern = r"https://digitallibrary.un.org/record/(\d+)/files/([A-Z]+)_([A-Z]+)_([\d]+)_([\d]+)-(\w{2})\.pdf"
                    match = re.search(pattern, value)
                    if match:
                        record_id = match.group(1)
                        symbol = match.group(2)         # e.g. A
                        doc_type = match.group(3)       # e.g. RES
                        unga = match.group(4)           # e.g. 61
                        resolution_id = match.group(5)  # e.g. 295
                        language_code = match.group(6)  # e.g. es
                        # Multilingual shortlink: https://undocs.org/A/RES/61/295
                        output_links.append(f"https://undocs.org/{symbol}/{doc_type}/{unga}/{resolution_id}")
                        # Monolingual shortlink: https://undocs.org/es/A/RES/61/295
                        output_links.append(f"https://undocs.org/{language_code.lower()}/{symbol}/{doc_type}/{unga}/{resolution_id}")
                    else:
                        print("No match found for the input string.")
        # Output is a list of 3 links:
        # 1 UN Library:                https://digitallibrary.un.org/record/606782/files/A_RES_61_295-ZH.pdf
        # 2 UN Docs multilingual link: https://undocs.org/A/RES/61/295
        # 3 UN Docs monolingual link:  https://undocs.org/zh/A/RES/61/295
        return output_links
# Call the function to extract and print the option values
#print(find_lang_UNdoc("https://undocs.org/en/UNEP/EA.5/28/Corr.1", "Russian"))
#print(find_lang_UNdoc("https://www.ohchr.org/en/documents/thematic-reports/ahrc3917-report-special-rapporteur-rights-indigenous-peoples", get_language_code("fr")))
import re

def convert_Intl_Day(url, language_code):
    """Swap the language segment in a un.org 'observances' URL.

    Args:
        url (str): UN International-Day URL, e.g.
            "https://www.un.org/es/observances/cities-day".
        language_code (str): Target language code; "ch" is mapped to "zh",
            the code un.org uses for Chinese.

    Returns:
        str: The URL pointing at the requested language.
    """
    target = 'zh' if language_code.lower() == 'ch' else language_code
    return re.sub(r'/([a-z]{2})/observances', f'/{target}/observances', url)
# Example usage:
#   convert_Intl_Day("https://www.un.org/es/observances/cities-day", "ch")
#   -> "https://www.un.org/zh/observances/cities-day"
import re

def convert_URLendingBy_langEqualsCode(url, language_code):
    """Swap the code in a '?lang=XX' URL query parameter (no URL validation).

    Args:
        url (str): URL carrying a ?lang=[A-Z]{2} parameter.
        language_code (str): Target language code; "ch" maps to "ZH".

    Returns:
        str: The URL with the upper-cased target code substituted.
    """
    code = 'ZH' if language_code.lower() == 'ch' else language_code.upper()
    return re.sub(r'(\?lang=)[A-Z]{2}', fr'\1{code}', url)
# Example usage:
#   convert_URLendingBy_langEqualsCode("https://www.unep.org/interactives/beat-plastic-pollution/?lang=ES", "ch")
#   -> "https://www.unep.org/interactives/beat-plastic-pollution/?lang=ZH"
# Ultimate finder function
def localize_URL(mi_URL: str, lengua: str="en") -> str:
    '''Apply all functions to try to find a language version of the input webpage
    in the provided language code.

    Strategy order: UN Docs links, International-Day pages, WeDocs documents,
    UNEP articles, then a generic any-website fallback (skipped for PDFs).

    Args:
        mi_URL (str): URL (or arbitrary href text) to localize.
        lengua (str): Target language code from the UI dropdown, default 'en'.

    Returns:
        str | None: Localized URL, or None (emails, unmatched PDFs, failures).
    '''
    resulting_link = None
    def is_email(string):
        # Guard: hrefs scraped from HTML may be plain email addresses.
        print(f"Validating if {string} is an email:")
        email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
        return bool(email_pattern.match(string))
    # Check if URL is not an email
    if is_email(mi_URL):
        print(f"{mi_URL} is an email")
        return None
    else:
        # try UN Docs
        # TODO find a way to scrape this search engine https://documents.un.org/prod/ods.nsf/home.xsp
        # or how to download the PDF, access the symbol tag and join the url to undocs.org/
        print("Trying find_lang_UNdoc for ", mi_URL)
        # find_lang_UNdoc expects a language NAME, hence get_language_code().
        resulting_link = find_lang_UNdoc(mi_URL, get_language_code(lengua))
        if resulting_link:
            # Last element is the monolingual undocs.org shortlink.
            return resulting_link[-1]
        # International Days
        if "/observances/" in mi_URL and "un.org/" in mi_URL:
            print("Trying convert_Intl_Day")
            resulting_link = convert_Intl_Day(mi_URL, lengua)
            return resulting_link
        # WeDocs UNEP
        if "wedocs.unep.org" in mi_URL:
            print("Trying convert_WeDocs_href")
            short_weDocs_url = weDocs_short(mi_URL)
            resulting_link = convert_WeDocs_href(short_weDocs_url, get_language_code(lengua))
            return resulting_link
        # try UNEP articles
        if "unep.org" in mi_URL and "wedocs" not in mi_URL:
            print("Trying convert_UNEP_url")
            resulting_link = convert_UNEP_url(mi_URL, lengua)
            return resulting_link
        elif ".pdf" not in mi_URL:
            # Generic fallback for any other (non-PDF) website.
            print("Trying convert_URL_anyWebsite")
            resulting_link = convert_URL_anyWebsite(mi_URL, lengua)
            print(resulting_link)
        if resulting_link is not None:
            return resulting_link
        else:
            return None
# test calls kept from the original for reference:
#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/N06/512/07/PDF/N0651207.pdf?OpenElement", "fr"))
#print(localize_URL("https://documents-dds-ny.un.org/doc/UNDOC/GEN/G16/015/38/PDF/G1601538.pdf?OpenElement", "fr"))
#print(localize_URL("https://undocs.org/FCCC/CP/2015/10/Add.1", "fr"))
#print(localize_URL("https://www.un.org/en/observances/environment-in-war-protection-day", "fr"))
#print(localize_URL(url5, "fr"))
def extract_href_attributes(html_content):
    """Return every href value from the <a> tags in *html_content*."""
    parsed = BeautifulSoup(html_content, 'html.parser')
    hrefs = []
    for anchor in parsed.find_all('a', href=True):
        hrefs.append(anchor['href'])
    return hrefs
#language_code = "es"
#UNEP_URL_DOWNREPLACE = "https://www.unep.org/news-and-stories/press-release/global-annual-finance-flows-7-trillion-fueling-climate-biodiversity#"
def extract_content_by_language(soup):
    """Serialize the children of div#field_body, flattening nested <div>s.

    Nested <div> wrappers are discarded; their non-div descendants are kept
    in document order. Returns None when the div is absent.
    """
    field_body_div = soup.find('div', id='field_body')
    if field_body_div is None:
        print(f"Div with id='field_body' not found in the HTML.")
        return None
    pieces = []
    def flatten(tag):
        # Recursively unwrap nested <div>s, keeping everything else.
        for child in tag.children:
            if child.name == 'div':
                flatten(child)
            else:
                pieces.append(str(child))
    for tag in field_body_div.find_all(recursive=False):
        if tag.name == 'div':
            flatten(tag)
        else:
            pieces.append(str(tag))
    return ''.join(pieces).strip()
# Filter video frames and images HTML tags
def transform_html_content(html_content):
    """Normalize embedded media: YouTube iframes -> <oembed> tags, and
    <figure>/<img>/<figcaption> groups -> a single <img> with a
    data-caption attribute.

    Args:
        html_content (str): HTML fragment to transform.

    Returns:
        BeautifulSoup: The modified document object.
    """
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    # Transform iframe tags with "youtu" in src attribute to oembed tags
    for iframe_tag in soup.find_all('iframe', src=lambda x: x and 'youtu' in x):
        src_attribute = iframe_tag['src']
        # Last path segment of an embed URL is assumed to be the video id
        # (presumes the '/embed/<id>' URL shape — TODO confirm for all inputs).
        video_id = src_attribute.split('/')[-1]  # Extract video ID from the src attribute
        oembed_tag = soup.new_tag('oembed')
        oembed_tag.string = f'https://www.youtube.com/watch?v={video_id}'
        iframe_tag.replace_with(oembed_tag)
    # Merge figure tags and their children into a single img tag
    for figure_tag in soup.find_all('figure'):
        img_tag = figure_tag.find('img')
        if img_tag:
            # Create a new img tag with merged attributes
            new_img_tag = soup.new_tag('img')
            new_img_tag.attrs = img_tag.attrs
            figcaption_tag = figure_tag.find('figcaption')
            if figcaption_tag:
                # Extract the content of figcaption tag for data-caption attribute
                # (only the first child node of the caption is kept).
                new_img_tag['data-caption'] = str(figcaption_tag.contents[0])
            figure_tag.replace_with(new_img_tag)
    # Return the modified HTML content
    return soup
# Link Replacer for HTML
def localize_UNEP_html(language_code, soup):
    """Fetch a UNEP page, extract its body content and localize its links.

    NOTE: despite its name, the *soup* parameter receives a URL string here
    (it is fetched and parsed internally); the name is kept for backward
    compatibility with existing callers.

    Args:
        language_code (str): The language code used for URL localization.
        soup (str): URL of the UNEP page to process.

    Returns:
        str | None: The transformed HTML with localized href attributes, or
        None when the page could not be fetched or has no field_body div.
    """
    # Access the URL
    print(f"Accessing the URL, type: {type(soup)}")
    soup = get_HTML_generic(soup)
    print(f"Accessing parsed HTML: {type(soup)}")
    # FIX: the original passed a possible None onward and crashed inside
    # extract_content_by_language / BeautifulSoup.
    if soup is None:
        return None
    # Filter only translatable content
    soup = extract_content_by_language(soup)
    print(f"Filtered HTML: {type(soup)}")
    if soup is None:
        return None
    # Transform images and embedded YouTube videos
    soup = transform_html_content(soup)
    print(f"Transformed IMG and IFRAME tags: {type(soup)}")
    # Localize every anchor's href in place.
    for a_tag in soup.find_all('a'):
        current_href = a_tag.get('href', '')
        localized_url = localize_URL(current_href, language_code)
        if localized_url is not None:
            a_tag['href'] = localized_url
    # Return the modified HTML content
    return str(soup)
#Code created by Nelson JAIMES-QUINTERO
# Gradio callback: rewrite every link in the pasted HTML to *language*.
def render_html(htmltext, language):
    """Return *htmltext* with each <a href> localized via localize_URL."""
    document = BeautifulSoup(htmltext, 'html.parser')
    for anchor in document.find_all('a'):
        target = localize_URL(anchor.get('href', ''), language)
        if target is not None:
            anchor['href'] = target
    return str(document)
# Create the Gradio interface: HTML in, language choice, localized HTML out.
with gr.Blocks() as demo:
    html_input = gr.Textbox(label="Enter HTML Code", lines=10, placeholder="Paste your HTML code here. You can convert Word file's content into HTML by using html-cleaner.com")
    # Codes must stay aligned with language_dict / UNEP_LANG_CODES.
    language_dropdown = gr.Dropdown(label="Select Language", choices=['es', 'fr', 'sw', 'en', 'zh-hans', 'pt-br', 'ru', 'ar'], value='es')
    html_output = gr.HTML(label="Rendered HTML")
    run_button = gr.Button("Find links!")
    run_button.click(render_html, inputs=[html_input, language_dropdown], outputs=html_output)
# Launch the Gradio app with default options (the original comment claimed
# debug=True/share=True, but no arguments are actually passed).
demo.launch()