# -*- coding: utf-8 -*-
"""Novels_extraction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1y4RtOWiKtyAarGmVWSxwe1cTvaiKtbOU
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK resources used below: 'stopwords' for filtering and
# 'punkt' for sentence tokenization.
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def clean_html(text):
    """Strip HTML/markup tags and collapse runs of whitespace."""
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text()
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text
def extract_metadata(text):
    """Pull title, author, and language from the Project Gutenberg header."""
    title_match = re.search(r"Title:\s*(.*)", text)
    author_match = re.search(r"Author:\s*(.*)", text)
    language_match = re.search(r"Language:\s*(.*)", text)
    metadata = {
        'title': title_match.group(1).strip() if title_match else 'Unknown',
        'author': author_match.group(1).strip() if author_match else 'Unknown',
        'language': language_match.group(1).strip() if language_match else 'Unknown',
    }
    return metadata
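# The regexes above target the header that Project Gutenberg places at the top
# of its plain-text files, which typically looks like (illustrative excerpt):
#   Title: The Adventures of Sherlock Holmes
#   Author: Arthur Conan Doyle
#   Language: English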
def preprocess_text(text):
    """Lowercase, strip unwanted characters, remove stopwords, and stem the rest."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[^a-zA-Z0-9.,!?':;]", ' ', text)
    text = text.lower()
    tokens = text.split()
    filtered_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)
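# Quick sanity check (illustrative only; the sample sentence is invented).
# Uncomment to verify that the output is lowercased, stopword-free, and stemmed:
# print(preprocess_text("The detectives were quietly watching the old houses."))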
def divide_into_sentences(text):
    """Divide text into sentences using NLTK's sentence tokenizer (requires 'punkt')."""
    sentences = nltk.sent_tokenize(text)
    return sentences
def fetch_and_preprocess_novels(urls):
    novels_data = []
    for url in urls:
        try:
            response = requests.get(url)
            response.raise_for_status()  # surface HTTP errors so they land in the except block
            novel_text = response.text
            metadata = extract_metadata(novel_text)
            cleaned_text = clean_html(novel_text)
            # Divide the cleaned text into sentences and preprocess each sentence
            sentences = divide_into_sentences(cleaned_text)
            for sentence in sentences:
                original_sentence = sentence  # Keep the original sentence for responses
                preprocessed_sentence = preprocess_text(sentence)  # Preprocess for classification
                novel_entry = {
                    'title': metadata['title'],
                    'author': metadata['author'],
                    'language': metadata['language'],
                    'content_preprocessed': preprocessed_sentence,
                    'content_original': original_sentence
                }
                novels_data.append(novel_entry)
        except Exception as e:
            print(f'Error processing URL {url}: {str(e)}')
    return pd.DataFrame(novels_data)
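# Example (sketch): spot-check the pipeline on a single novel before the full run.
# sample_df = fetch_and_preprocess_novels(['https://www.gutenberg.org/cache/epub/1661/pg1661.txt'])
# sample_df.head()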
novel_directory = '/content'
os.makedirs(novel_directory, exist_ok=True)
urls = [
'https://www.gutenberg.org/cache/epub/1661/pg1661.txt',
'https://www.gutenberg.org/cache/epub/132/pg132.txt',
'https://www.gutenberg.org/cache/epub/35/pg35.txt',
'https://www.gutenberg.org/cache/epub/147/pg147.txt',
'https://www.gutenberg.org/cache/epub/72159/pg72159.txt',
'https://www.gutenberg.org/cache/epub/67866/pg67866.txt',
'https://www.gutenberg.org/cache/epub/56062/pg56062.txt',
'https://www.gutenberg.org/cache/epub/67560/pg67560.txt',
'https://www.gutenberg.org/cache/epub/70698/pg70698.txt',
'https://www.gutenberg.org/cache/epub/56779/pg56779.txt',
'https://www.gutenberg.org/cache/epub/70797/pg70797.txt',
'https://www.gutenberg.org/cache/epub/70448/pg70448.txt',
'https://www.gutenberg.org/cache/epub/71087/pg71087.txt',
'https://www.gutenberg.org/cache/epub/71815/pg71815.txt',
'https://www.gutenberg.org/cache/epub/71049/pg71049.txt'
]
novels_df = fetch_and_preprocess_novels(urls)
novels_df['Document ID'] = range(1, len(novels_df) + 1)
print(novels_df.head())
novels_df.to_csv('/content/novels_data.csv', index=False)
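# Downstream use (sketch, assuming the chatbot loads this CSV in a later step):
# the 'content_preprocessed' column would feed retrieval/classification, and
# 'content_original' would be returned to the user as the response text, e.g.
#   df = pd.read_csv('/content/novels_data.csv')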