# NOTE: the original notebook export carried stray "Spaces:" / "Runtime error"
# artifact lines here; they were not part of the program and have been
# converted to this comment so the file parses.
# -*- coding: utf-8 -*-
"""Novels_extraction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1y4RtOWiKtyAarGmVWSxwe1cTvaiKtbOU
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
import nltk  # bug fix: 'download_nltk' is not a real module; NLTK itself provides download()
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the corpora used below: 'stopwords' for token filtering and
# 'punkt' for sentence tokenization.
nltk.download('stopwords')
nltk.download('punkt')

# Shared preprocessing state used by preprocess_text().
stop_words = set(stopwords.words('english'))  # English stopword lookup set
stemmer = PorterStemmer()  # single stemmer instance reused for every token
def clean_html(text):
    """Strip all HTML markup from *text* and collapse whitespace runs.

    Returns the tag-free text with every run of whitespace (including
    newlines) reduced to a single space.
    """
    stripped = BeautifulSoup(text, 'html.parser').get_text()
    return ' '.join(stripped.split())
def extract_metadata(text):
    """Pull title/author/language fields from a Project Gutenberg header.

    Each field is matched as a ``Label: value`` line; a field that is not
    found maps to the string ``'Unknown'``.

    Returns a dict with keys ``'title'``, ``'author'``, ``'language'``.
    """
    metadata = {}
    for key, label in (('title', 'Title'),
                       ('author', 'Author'),
                       ('language', 'Language')):
        match = re.search(label + r":\s*(.*)", text)
        metadata[key] = match.group(1).strip() if match else 'Unknown'
    return metadata
def preprocess_text(text):
    """Normalize *text* for classification.

    Collapses whitespace, replaces characters outside the allowed set with
    spaces, lowercases, then drops stopwords and stems each remaining token.
    Relies on the module-level ``stop_words`` set and ``stemmer``.
    """
    normalized = re.sub(r'\s+', ' ', text)
    normalized = re.sub(r"[^a-zA-Z0-9.,!?':;]", ' ', normalized).lower()
    kept = (stemmer.stem(token)
            for token in normalized.split()
            if token not in stop_words)
    return ' '.join(kept)
def divide_into_sentences(text):
    """Split *text* into sentences with NLTK's Punkt sentence tokenizer.

    Bug fix: the original called ``download_nltk.sent_tokenize``, but
    ``download_nltk`` is not a real package — the tokenizer lives in
    ``nltk.tokenize`` (and needs the 'punkt' data downloaded first).

    Returns a list of sentence strings.
    """
    # Local import keeps this fix self-contained within the function.
    from nltk.tokenize import sent_tokenize
    return sent_tokenize(text)
def fetch_and_preprocess_novels(urls):
    """Download each novel, split it into sentences, and preprocess them.

    Parameters
    ----------
    urls : iterable of str
        Plain-text novel URLs (e.g. Project Gutenberg ``.txt`` links).

    Returns
    -------
    pandas.DataFrame
        One row per sentence: title/author/language metadata plus the
        original sentence and its preprocessed form.  URLs that fail to
        download or parse are reported and skipped.
    """
    novels_data = []
    for url in urls:
        try:
            # Robustness: bound the request and fail fast on HTTP errors —
            # the original would happily process a 404 page as novel text.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            novel_text = response.text
            metadata = extract_metadata(novel_text)
            cleaned_text = clean_html(novel_text)
            # Preprocess sentence-by-sentence, keeping the original wording
            # alongside the classification-ready form.
            for sentence in divide_into_sentences(cleaned_text):
                novels_data.append({
                    'title': metadata['title'],
                    'author': metadata['author'],
                    'language': metadata['language'],
                    'content_preprocessed': preprocess_text(sentence),
                    'content_original': sentence,
                })
        except Exception as e:
            # Best-effort: report the failure and continue with the rest.
            print(f'Error processing URL {url}: {str(e)}')
    return pd.DataFrame(novels_data)
# --- Script entry: fetch the corpus and persist it as CSV -------------------
novel_directory = '/content'
os.makedirs(novel_directory, exist_ok=True)

# Project Gutenberg plain-text novel URLs to ingest.
urls = [
    'https://www.gutenberg.org/cache/epub/1661/pg1661.txt',
    'https://www.gutenberg.org/cache/epub/132/pg132.txt',
    'https://www.gutenberg.org/cache/epub/35/pg35.txt',
    'https://www.gutenberg.org/cache/epub/147/pg147.txt',
    'https://www.gutenberg.org/cache/epub/72159/pg72159.txt',
    'https://www.gutenberg.org/cache/epub/67866/pg67866.txt',
    'https://www.gutenberg.org/cache/epub/56062/pg56062.txt',
    'https://www.gutenberg.org/cache/epub/67560/pg67560.txt',
    'https://www.gutenberg.org/cache/epub/70698/pg70698.txt',
    'https://www.gutenberg.org/cache/epub/56779/pg56779.txt',
    'https://www.gutenberg.org/cache/epub/70797/pg70797.txt',
    'https://www.gutenberg.org/cache/epub/70448/pg70448.txt',
    'https://www.gutenberg.org/cache/epub/71087/pg71087.txt',
    'https://www.gutenberg.org/cache/epub/71815/pg71815.txt',
    'https://www.gutenberg.org/cache/epub/71049/pg71049.txt'
]

novels_df = fetch_and_preprocess_novels(urls)
# Sequential 1-based identifier for each sentence row.
novels_df['Document ID'] = range(1, len(novels_df) + 1)
# Consistency fix: build the output path from novel_directory rather than
# repeating a hard-coded '/content' literal.  (The bare `novels_df` display
# line from the notebook was a no-op in a script and has been removed.)
novels_df.to_csv(os.path.join(novel_directory, 'novels_data.csv'), index=False)