# -*- coding: utf-8 -*-
"""Novels_extraction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1y4RtOWiKtyAarGmVWSxwe1cTvaiKtbOU
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK resources used below: 'stopwords' for filtering and
# 'punkt' for sentence tokenization.
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def clean_html(text):
    """Strip HTML/markup tags and collapse runs of whitespace."""
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text()
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text
def extract_metadata(text):
    """Pull title, author, and language from the Project Gutenberg header."""
    title_match = re.search(r"Title:\s*(.*)", text)
    author_match = re.search(r"Author:\s*(.*)", text)
    language_match = re.search(r"Language:\s*(.*)", text)
    metadata = {
        'title': title_match.group(1).strip() if title_match else 'Unknown',
        'author': author_match.group(1).strip() if author_match else 'Unknown',
        'language': language_match.group(1).strip() if language_match else 'Unknown',
    }
    return metadata
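# The regexes above target the header that Project Gutenberg places at the top
# of its plain-text files, which typically looks like (illustrative excerpt):
#   Title: The Adventures of Sherlock Holmes
#   Author: Arthur Conan Doyle
#   Language: English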
def preprocess_text(text):
    """Lowercase, strip unwanted characters, remove stopwords, and stem the rest."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[^a-zA-Z0-9.,!?':;]", ' ', text)
    text = text.lower()
    tokens = text.split()
    filtered_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)
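# Quick sanity check (illustrative only; the sample sentence is invented).
# Uncomment to verify that the output is lowercased, stopword-free, and stemmed:
# print(preprocess_text("The detectives were quietly watching the old houses."))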
def divide_into_sentences(text):
    """Divide text into sentences using NLTK's sentence tokenizer (requires 'punkt')."""
    sentences = nltk.sent_tokenize(text)
    return sentences
def fetch_and_preprocess_novels(urls):
    novels_data = []
    for url in urls:
        try:
            response = requests.get(url)
            response.raise_for_status()  # surface HTTP errors so they land in the except block
            novel_text = response.text
            metadata = extract_metadata(novel_text)
            cleaned_text = clean_html(novel_text)
            # Divide the cleaned text into sentences and preprocess each sentence
            sentences = divide_into_sentences(cleaned_text)
            for sentence in sentences:
                original_sentence = sentence  # Keep the original sentence for responses
                preprocessed_sentence = preprocess_text(sentence)  # Preprocess for classification
                novel_entry = {
                    'title': metadata['title'],
                    'author': metadata['author'],
                    'language': metadata['language'],
                    'content_preprocessed': preprocessed_sentence,
                    'content_original': original_sentence
                }
                novels_data.append(novel_entry)
        except Exception as e:
            print(f'Error processing URL {url}: {str(e)}')
    return pd.DataFrame(novels_data)
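# Example (sketch): spot-check the pipeline on a single novel before the full run.
# sample_df = fetch_and_preprocess_novels(['https://www.gutenberg.org/cache/epub/1661/pg1661.txt'])
# sample_df.head()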
novel_directory = '/content'
os.makedirs(novel_directory, exist_ok=True)
urls = [
'https://www.gutenberg.org/cache/epub/1661/pg1661.txt',
'https://www.gutenberg.org/cache/epub/132/pg132.txt',
'https://www.gutenberg.org/cache/epub/35/pg35.txt',
'https://www.gutenberg.org/cache/epub/147/pg147.txt',
'https://www.gutenberg.org/cache/epub/72159/pg72159.txt',
'https://www.gutenberg.org/cache/epub/67866/pg67866.txt',
'https://www.gutenberg.org/cache/epub/56062/pg56062.txt',
'https://www.gutenberg.org/cache/epub/67560/pg67560.txt',
'https://www.gutenberg.org/cache/epub/70698/pg70698.txt',
'https://www.gutenberg.org/cache/epub/56779/pg56779.txt',
'https://www.gutenberg.org/cache/epub/70797/pg70797.txt',
'https://www.gutenberg.org/cache/epub/70448/pg70448.txt',
'https://www.gutenberg.org/cache/epub/71087/pg71087.txt',
'https://www.gutenberg.org/cache/epub/71815/pg71815.txt',
'https://www.gutenberg.org/cache/epub/71049/pg71049.txt'
]
novels_df = fetch_and_preprocess_novels(urls)
novels_df['Document ID'] = range(1, len(novels_df) + 1)
print(novels_df.head())
novels_df.to_csv('/content/novels_data.csv', index=False)
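# Downstream use (sketch, assuming the chatbot loads this CSV in a later step):
# the 'content_preprocessed' column would feed retrieval/classification, and
# 'content_original' would be returned to the user as the response text, e.g.
#   df = pd.read_csv('/content/novels_data.csv')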