Spaces:
Sleeping
Sleeping
Upload 10 files
Browse files- 1_get_books.py +131 -0
- 2_remove_intros_from_books.py +53 -0
- 3_chunking_the_books.py +56 -0
- 4_embed_chromadb.py +80 -0
- 5_querry_library.py +94 -0
- 6_chat_with_a_library.py +110 -0
- count_tokens.py +41 -0
- license_.txt +10 -0
- requirements.txt +3 -2
- token_counts.json +101 -0
1_get_books.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import necessary libraries
|
| 2 |
+
import requests # For making HTTP requests
|
| 3 |
+
from bs4 import BeautifulSoup # For parsing HTML
|
| 4 |
+
import os # For file and directory operations
|
| 5 |
+
import re # For regular expressions
|
| 6 |
+
import time # For adding delays
|
| 7 |
+
import random # For generating random numbers
|
| 8 |
+
|
| 9 |
+
# Strip characters that are illegal (or troublesome) in filenames.
def sanitize_filename(filename):
    """Return *filename* with \\ / * ? : " < > | removed."""
    # str.translate with a deletion table does one C-level pass.
    return filename.translate(str.maketrans('', '', '\\/*?:"<>|'))
|
| 12 |
+
|
| 13 |
+
# Function to download a book given its URL and title
def download_book(url, title):
    """Download the Plain Text UTF-8 edition of a Gutenberg book.

    Returns True when the book is on disk (downloaded now or on a previous
    run), False when no plain-text link exists or an error occurs.
    """
    # Build a filesystem-safe filename from the title
    filename = sanitize_filename(f"{title}.txt")

    # Already on disk from a previous run: treat as success.
    # FIX: the original printed the literal text "(unknown)" inside an
    # f-string — the interpolation was clearly lost; restored {filename}.
    if os.path.exists(filename):
        print(f"Skipping: {filename} (already downloaded)")
        time.sleep(1/10)  # Small courtesy delay even when skipping
        return True

    try:
        # Fetch the book's landing page and locate the plain-text link
        # (timeout added so a stalled server cannot hang the crawl forever)
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        download_link = soup.find('a', string='Plain Text UTF-8')

        if download_link:
            # Construct the absolute download URL and fetch the content
            book_url = 'https://www.gutenberg.org' + download_link['href']
            book_content = requests.get(book_url, timeout=30).text

            # Save the book content to a file
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(book_content)
            # FIX: same lost interpolation as above; restored {title}.
            print(f"Downloaded: {title}")
            # Random delay between 1 and 10 seconds to stay polite to the server
            time.sleep(random.uniform(1, 10))
            return True
        else:
            print(f"Could not find Plain Text UTF-8 download link for: {title}")
            # Shorter delay: nothing was actually transferred
            time.sleep(random.uniform(1, 3))
            return False
    except requests.RequestException as e:
        print(f"Network error downloading {title}: {e}")
        time.sleep(random.uniform(1, 10))  # Back off after a network failure
        return False
    except OSError as e:
        print(f"Error downloading {title}: {e}")
        time.sleep(random.uniform(1, 10))
        return False
    except Exception as e:
        print(f"Unexpected error downloading {title}: {e}")
        time.sleep(random.uniform(1, 10))
        return False
|
| 62 |
+
|
| 63 |
+
# Generator yielding (book_url, title) for every hit on one results page.
def get_book_links(url):
    try:
        page = requests.get(url)
        markup = BeautifulSoup(page.content, 'html.parser')

        # Each search hit is rendered as <li class="booklink">
        for entry in markup.find_all('li', class_='booklink'):
            span = entry.find('span', class_='title')
            if not span:
                continue  # malformed entry: no title span
            # The title span sits inside the anchor pointing at the book page
            href = span.find_parent('a')['href']
            yield ('https://www.gutenberg.org' + href, span.text.strip())
    except requests.RequestException as e:
        print(f"Network error fetching book links: {e}")
    except Exception as e:
        print(f"Unexpected error fetching book links: {e}")
|
| 84 |
+
|
| 85 |
+
# Resolve the absolute URL of the 'Next' results page, or None when absent.
def get_next_page_url(url):
    try:
        markup = BeautifulSoup(requests.get(url).content, 'html.parser')
        nxt = markup.find('a', string='Next')
        # Conditional expression: absolute URL when the link exists, else None
        return 'https://www.gutenberg.org' + nxt['href'] if nxt else None
    except requests.RequestException as e:
        print(f"Network error fetching next page URL: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error fetching next page URL: {e}")
        return None
|
| 102 |
+
|
| 103 |
+
# Main execution block: crawl Gutenberg's philosophy search results and
# download books until the user's requested count is reached.
if __name__ == "__main__":
    # Set the initial search URL (philosophy query, URL-encoded '!')
    base_url = "https://www.gutenberg.org/ebooks/search/?query=philosophy&submit_search=Go%21"

    # Ask user how many books to download (raises ValueError on non-numeric input)
    num_books = int(input("How many books do you want to download? "))

    # Create a 'books' directory if it doesn't exist
    if not os.path.exists('books'):
        os.makedirs('books')
    # NOTE(review): indentation reconstructed — chdir is assumed to run
    # unconditionally so downloads always land in books/; confirm.
    os.chdir('books')  # Change to the 'books' directory

    current_url = base_url
    books_downloaded = 0  # successful download_book calls this run

    # Main download loop: walk result pages until quota or no more pages.
    # NOTE(review): download_book returns True for already-present files too,
    # so the final "new books" count may include skipped re-downloads.
    while books_downloaded < num_books and current_url:
        for book_url, title in get_book_links(current_url):
            if books_downloaded >= num_books:
                break
            if download_book(book_url, title):
                books_downloaded += 1

        # Move to the next page
        current_url = get_next_page_url(current_url)
        time.sleep(2)  # Add a delay between pages

    print(f"Downloaded {books_downloaded} new books.")
|
2_remove_intros_from_books.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import necessary libraries
|
| 2 |
+
import os # For file and directory operations
|
| 3 |
+
import re # For regular expression operations
|
| 4 |
+
import asyncio # For asynchronous programming
|
| 5 |
+
import aiofiles # For asynchronous file I/O operations
|
| 6 |
+
|
| 7 |
+
# Strip the Project Gutenberg boilerplate header from a book's text.
async def remove_intro(content):
    """Return *content* with everything up to and including the
    '*** START OF THE PROJECT GUTENBERG EBOOK ...' marker line removed.
    Unchanged when no marker is present."""
    header = r'.*?\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\n'
    # DOTALL so '.' spans newlines; count=1 removes only the first match
    return re.sub(header, '', content, count=1, flags=re.DOTALL)
|
| 13 |
+
|
| 14 |
+
# Read one book, strip its Gutenberg intro, and write the cleaned copy.
async def process_file(input_path, output_path):
    async with aiofiles.open(input_path, 'r', encoding='utf-8') as src:
        text = await src.read()

    cleaned = await remove_intro(text)

    async with aiofiles.open(output_path, 'w', encoding='utf-8') as dst:
        await dst.write(cleaned)
|
| 26 |
+
|
| 27 |
+
# Strip the intro from every .txt book in 'books', writing results to
# 'processed_books'; all files are processed concurrently.
async def process_books():
    src_dir = 'books'             # Folder containing original books
    dst_dir = 'processed_books'   # Folder to store processed books

    # exist_ok makes this a no-op when the folder is already there
    os.makedirs(dst_dir, exist_ok=True)

    # One task per .txt file, built in a single comprehension
    jobs = [
        asyncio.create_task(
            process_file(os.path.join(src_dir, name), os.path.join(dst_dir, name))
        )
        for name in os.listdir(src_dir)
        if name.endswith('.txt')
    ]

    # Wait for every file to finish
    await asyncio.gather(*jobs)
    print(f"All books processed. Output saved in '{dst_dir}' folder.")
|
| 49 |
+
|
| 50 |
+
# Entry point of the script
if __name__ == '__main__':
    # asyncio.run creates the event loop, runs the pipeline, and closes it
    asyncio.run(process_books())
|
3_chunking_the_books.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import necessary libraries
|
| 2 |
+
import os # For file and directory operations
|
| 3 |
+
import json # For JSON file handling
|
| 4 |
+
from math import ceil # For rounding up numbers
|
| 5 |
+
|
| 6 |
+
# Function to split text into overlapping chunks
def chunk_text(text, chunk_size=500, overlap_percent=25):
    """Split *text* into chunks of *chunk_size* whitespace-separated words,
    with consecutive chunks overlapping by *overlap_percent* of the chunk.

    Returns a list of strings; empty/whitespace-only text yields [].
    """
    words = text.split()
    # Number of words shared between consecutive chunks
    overlap_size = int(chunk_size * overlap_percent / 100)
    # Distance between successive chunk starts.
    # FIX: clamp to >= 1 — with overlap_percent >= 100 the original step
    # became 0 (or negative) and range() raised ValueError.
    step = max(1, chunk_size - overlap_size)

    # One chunk per window start; the final chunks may be shorter
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
|
| 25 |
+
|
| 26 |
+
# Chunk every .txt book in a folder and dump the mapping to one JSON file.
def process_books(input_folder, output_file, chunk_size=500, overlap_percent=25):
    """Write {filename: [chunk, ...]} for all books to *output_file*."""
    chunked = {}

    for name in os.listdir(input_folder):
        if not name.endswith('.txt'):
            continue  # only text files are books
        path = os.path.join(input_folder, name)
        with open(path, 'r', encoding='utf-8') as fh:
            text = fh.read()
        chunked[name] = chunk_text(text, chunk_size, overlap_percent)

    # Human-readable JSON; ensure_ascii=False keeps non-Latin titles intact
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(chunked, out, indent=4, ensure_ascii=False, separators=(',', ': '))
|
| 46 |
+
|
| 47 |
+
# Usage example — NOTE: these statements execute at import time; the module
# is intended to be run as a script.
input_folder = 'processed_books'  # Folder containing the books to process
output_file = 'chunked_books.json'  # File to store the chunked books
chunk_size = 500  # Size of each chunk in words
overlap_percent = 25  # Percentage of overlap between chunks

# Process the books and save the chunks
process_books(input_folder, output_file, chunk_size, overlap_percent)
# Print a message indicating that the process is complete
print(f"Chunked books have been saved to {output_file}")
|
4_embed_chromadb.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import necessary libraries
|
| 2 |
+
import json # For handling JSON data
|
| 3 |
+
import os # For interacting with the operating system
|
| 4 |
+
import asyncio # For asynchronous programming
|
| 5 |
+
import aiofiles # For asynchronous file operations
|
| 6 |
+
import chromadb # For vector database operations
|
| 7 |
+
from chromadb.utils import embedding_functions # For creating embedding functions
|
| 8 |
+
from termcolor import colored # For colored console output
|
| 9 |
+
|
| 10 |
+
# Load chunked_books.json as a dict mapping book title -> list of chunks.
async def load_chunked_books():
    async with aiofiles.open('chunked_books.json', 'r', encoding='utf-8') as fh:
        raw = await fh.read()
    return json.loads(raw)
|
| 15 |
+
|
| 16 |
+
# Store a single chunk in the collection, throttled by *semaphore*.
async def embed_chunk(collection, book_title, i, chunk, semaphore):
    async with semaphore:
        # The ID combines book and position, so every chunk's ID is unique
        collection.add(
            ids=[f"{book_title}_chunk_{i}"],
            documents=[chunk],
            metadatas=[{"book_title": book_title, "chunk_index": i}],
        )
|
| 26 |
+
|
| 27 |
+
# Fan out one embedding task per chunk across all books, then await them all.
async def process_chunks(chunked_books, collection, max_books=None):
    """Embed every chunk of up to *max_books* books into *collection*.

    max_books caps the number of BOOKS (not chunks); None means no cap.
    """
    # NOTE(review): collection.add is a synchronous call inside the async
    # tasks, so each add blocks the event loop; the 30000-slot semaphore is
    # far larger than any concurrency actually achieved — confirm whether a
    # smaller bound (or a thread pool) was intended.
    semaphore = asyncio.Semaphore(30000)
    tasks = []
    # Iterate through books and their chunks
    for i, (book_title, chunks) in enumerate(chunked_books.items()):
        # Stop once the optional book limit is reached
        if max_books is not None and i >= max_books:
            break
        print(colored(f"Processing {book_title}", "green"))
        # One task per chunk; j is the chunk's index within its book
        for j, chunk in enumerate(chunks):
            task = asyncio.create_task(embed_chunk(collection, book_title, j, chunk, semaphore))
            tasks.append(task)

    total_chunks = len(tasks)
    print(colored(f"Embedding {total_chunks} chunks", "blue"))

    # Wait for all embedding tasks to complete
    await asyncio.gather(*tasks)
|
| 48 |
+
|
| 49 |
+
# Orchestrate the whole embedding run: load chunks, open the store, embed.
async def main():
    chunked = await load_chunked_books()

    # Persistent client so the embeddings survive across runs
    db = chromadb.PersistentClient(path="./chroma_db")

    # Embeddings come from OpenAI's text-embedding-3-large model
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-large"
    )

    # Cosine-distance HNSW index; reuses the collection when it already exists
    collection = db.get_or_create_collection(
        name="books_collection",
        embedding_function=embedder,
        metadata={"hnsw:space": "cosine"}
    )

    await process_chunks(chunked, collection, max_books=None)

    # Report how many chunks were handed to the collection in total
    total_chunks = sum(len(chunks) for chunks in chunked.values())
    print(f"Embedded and stored {total_chunks} chunks in the Chroma collection.")
|
| 76 |
+
|
| 77 |
+
# Entry point of the script
if __name__ == "__main__":
    # Run the async embedding pipeline to completion
    asyncio.run(main())
|
5_querry_library.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import necessary libraries
|
| 2 |
+
import os # For environment variables
|
| 3 |
+
import json # For JSON operations
|
| 4 |
+
import chromadb # For vector database operations
|
| 5 |
+
from chromadb.utils import embedding_functions # For creating embedding functions
|
| 6 |
+
from termcolor import colored # For colored console output
|
| 7 |
+
|
| 8 |
+
# Return the n_results chunks most similar to *query*.
def query_collection(query, n_results=5):
    # Embedder must match the one the collection was built with
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-large"
    )

    store = chromadb.PersistentClient(path="./chroma_db")
    collection = store.get_collection(
        name="books_collection",
        embedding_function=embedder,
    )

    # Single-query search; Chroma returns parallel lists per query
    return collection.query(query_texts=[query], n_results=n_results)
|
| 32 |
+
|
| 33 |
+
# Persist a Chroma query result to query_results.json.
def save_results_to_json(results, query, n_results):
    """Record the query, the requested count, and one entry per returned
    chunk (1-based result number, book title, chunk index, content)."""
    docs = results['documents'][0]
    metas = results['metadatas'][0]

    payload = {
        "query": query,
        "n_results": n_results,
        "results": [
            {
                "result_number": number,
                "book_title": meta['book_title'],
                "chunk_index": meta['chunk_index'],
                "content": doc,
            }
            for number, (doc, meta) in enumerate(zip(docs, metas), 1)
        ],
    }

    with open("query_results.json", "w") as fh:
        json.dump(payload, fh, indent=4)
|
| 54 |
+
|
| 55 |
+
# Interactive loop: take queries, print the best-matching chunks, save them.
def main():
    print(colored("Welcome to the Book Query System!", "green"))
    print("You can ask questions about the books in the collection.")

    while True:
        query = input(colored("\nEnter your query (or 'quit' to exit): ", "yellow"))
        if query.lower() == 'quit':
            break

        # How many chunks the user wants back
        n_results = int(input(colored("How many results would you like? ", "yellow")))
        results = query_collection(query, n_results)

        # Display each hit with its provenance
        print(colored("\nTop matching chunks:", "green"))
        hits = zip(results['documents'][0], results['metadatas'][0])
        for number, (doc, meta) in enumerate(hits, 1):
            print(colored(f"\nResult {number}:", "blue"))
            print(f"Book: {meta['book_title']}")
            print(f"Chunk Index: {meta['chunk_index']}")
            print(colored("Content:", "cyan"))
            print(doc)

        # Persist this round's results for later inspection
        save_results_to_json(results, query, n_results)
        print(colored("\nResults have been saved to query_results.json", "green"))

    print(colored("Thank you for using the Book Query System!", "green"))
|
| 91 |
+
|
| 92 |
+
# Check if the script is being run directly
if __name__ == "__main__":
    # Start the interactive query loop
    main()
|
6_chat_with_a_library.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import necessary libraries
|
| 2 |
+
import os # For environment variables
|
| 3 |
+
import json # For JSON operations
|
| 4 |
+
import chromadb # For vector database operations
|
| 5 |
+
from chromadb.utils import embedding_functions # For creating embedding functions
|
| 6 |
+
from termcolor import colored # For colored console output
|
| 7 |
+
from openai import OpenAI # For interacting with OpenAI's API
|
| 8 |
+
|
| 9 |
+
# Fetch the n_results chunks most similar to *query* from the Chroma store.
def query_collection(query, n_results):
    db = chromadb.PersistentClient(path="./chroma_db")

    # Same embedding model used when the collection was populated
    embedder = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-large"
    )

    books = db.get_collection(
        name="books_collection",
        embedding_function=embedder,
    )

    return books.query(query_texts=[query], n_results=n_results)
|
| 29 |
+
|
| 30 |
+
# Persist the retrieved chunks (plus the originating query) to disk.
def save_chunks_to_json(results, query):
    # One record per retrieved chunk, preserving provenance
    chunks = [
        {
            "book_title": meta['book_title'],
            "chunk_index": meta['chunk_index'],
            "content": doc,
        }
        for doc, meta in zip(results['documents'][0], results['metadatas'][0])
    ]

    with open("retrieved_chunks.json", "w") as fh:
        json.dump({"query": query, "chunks": chunks}, fh, indent=4)

    # Confirm to the user where the data went
    print(colored("Retrieved chunks have been saved to retrieved_chunks.json", "green"))
|
| 52 |
+
|
| 53 |
+
# Stream a GPT-4o answer to *query*, grounded only in *context*, to stdout.
def generate_response(client, query, context):
    system_prompt = "You are a helpful assistant. Answer the query based on the provided context. in 100 words or less. unless user asks for more detail. Do not refer to any author but just combine the best pieces of the provided content to answer the query. Only answer based on the provided content."

    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Context:\n{context}\n\nQuery: {query}"},
        ],
        stream=True,
    )

    # Echo tokens as they arrive for a responsive feel
    for piece in stream:
        delta = piece.choices[0].delta.content
        if delta is not None:
            print(colored(delta, "green"), end='', flush=True)

    print()  # terminate the streamed line
|
| 72 |
+
|
| 73 |
+
# Interactive RAG loop: retrieve chunks, then answer the query with GPT-4o.
def main():
    print(colored("Welcome to the Book Query System!", "green"))

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    # Fixed context size for the whole session
    n_results = int(input(colored("How many chunks would you like to use for context? ", "yellow")))

    while True:
        query = input(colored("\nEnter your query (or 'quit' to exit): ", "yellow"))
        if query.lower() == 'quit':
            break

        results = query_collection(query, n_results)
        save_chunks_to_json(results, query)

        # Stitch the retrieved chunks into one context string for the LLM
        pairs = zip(results['documents'][0], results['metadatas'][0])
        context = "\n\n".join(
            f"Book: {meta['book_title']}\nChunk Index: {meta['chunk_index']}\nContent: {doc}"
            for doc, meta in pairs
        )

        print(colored("\nGenerating response based on the context...", "green"))
        generate_response(client, query, context)

    print(colored("Thank you for using the Book Query System!", "green"))
|
| 107 |
+
|
| 108 |
+
# Run the main function if this script is executed directly
if __name__ == "__main__":
    # Start the interactive chat loop
    main()
|
count_tokens.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import tiktoken
|
| 4 |
+
|
| 5 |
+
def count_tokens(text, encoding_name="cl100k_base"):
    """Return how many tokens *text* encodes to under *encoding_name*."""
    tokens = tiktoken.get_encoding(encoding_name).encode(text)
    return len(tokens)
|
| 8 |
+
|
| 9 |
+
def count_words(text):
    """Count whitespace-separated words in *text*."""
    return sum(1 for _ in text.split())
|
| 11 |
+
|
| 12 |
+
def process_books(input_folder, output_file):
    """Tally token and word counts per book plus totals; save them as JSON."""
    all_counts = {}
    total_tokens = 0
    total_words = 0

    for name in os.listdir(input_folder):
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(input_folder, name), 'r', encoding='utf-8') as fh:
            text = fh.read()
        tok = count_tokens(text)
        wrd = count_words(text)
        all_counts[name] = {"tokens": tok, "words": wrd}
        total_tokens += tok
        total_words += wrd

    # Totals live alongside the per-book entries in the same JSON object
    all_counts["total_tokens"] = total_tokens
    all_counts["total_words"] = total_words
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(all_counts, out, indent=4, ensure_ascii=False)

    print(f"Total tokens across all books: {total_tokens}")
    print(f"Total words across all books: {total_words}")
    print(f"Token and word counts for each book and totals have been saved to {output_file}")
|
| 36 |
+
|
| 37 |
+
# Usage — NOTE: executes at import time; this module is a script.
input_folder = 'processed_books'  # books already stripped of Gutenberg intros
output_file = 'token_and_word_counts.json'  # destination for the counts

# Count tokens/words for every book and write the summary JSON
process_books(input_folder, output_file)
|
license_.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
all software provided by echohive is provided "as is", without warranty of any kind, express or
|
| 3 |
+
implied, including but not limited to the warranties of merchantability,
|
| 4 |
+
fitness for a particular purpose and noninfringement. in no event shall the
|
| 5 |
+
authors or copyright holders be liable for any claim, damages or other
|
| 6 |
+
liability, whether in an action of contract, tort or otherwise, arising from,
|
| 7 |
+
out of or in connection with the software or the use or other dealings in
|
| 8 |
+
the software.
|
| 9 |
+
|
| 10 |
+
the software shall not be used for any malicious purposes
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
| 1 |
+
openai==1.34.0
chromadb==0.4.24
termcolor==2.4.0
requests
beautifulsoup4
aiofiles
tiktoken
|
token_counts.json
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"A little philosophy of life.txt": 9132,
|
| 3 |
+
"A System of Logic, Ratiocinative and Inductive.txt": 546472,
|
| 4 |
+
"A Treatise of Human Nature.txt": 296762,
|
| 5 |
+
"Aesthetical Essays of Friedrich Schiller.txt": 189986,
|
| 6 |
+
"Ainsi Parlait Zarathoustra (French).txt": 194454,
|
| 7 |
+
"Also sprach Zarathustra Ein Buch für Alle und Keinen (German).txt": 169738,
|
| 8 |
+
"An Enquiry Concerning Human Understanding.txt": 82190,
|
| 9 |
+
"An Enquiry Concerning the Principles of Morals.txt": 70435,
|
| 10 |
+
"Anthropology [a lecture delivered at Columbia University in the series on science, philosophy and art, December 18, 1907].txt": 13097,
|
| 11 |
+
"Aphorismen zur Lebensweisheit (German).txt": 140512,
|
| 12 |
+
"Apology.txt": 24980,
|
| 13 |
+
"Aristotle on the art of poetry.txt": 30192,
|
| 14 |
+
"Autobiography.txt": 102169,
|
| 15 |
+
"Beyond Good and Evil.txt": 94142,
|
| 16 |
+
"Bushido, the Soul of Japan.txt": 51071,
|
| 17 |
+
"Considerations on Representative Government.txt": 125315,
|
| 18 |
+
"Democracy and Education An Introduction to the Philosophy of Education.txt": 176129,
|
| 19 |
+
"Der Wille zur Macht Eine Auslegung alles Geschehens (German).txt": 224176,
|
| 20 |
+
"Dialogues Concerning Natural Religion.txt": 52536,
|
| 21 |
+
"Discours de la méthode (French).txt": 204766,
|
| 22 |
+
"Discourse on the Method of Rightly Conducting One's Reason and of Seeking Truth in the Sciences.txt": 32497,
|
| 23 |
+
"Essais de Montaigne (self-édition) - Volume I (French).txt": 582802,
|
| 24 |
+
"Essays of Schopenhauer.txt": 102591,
|
| 25 |
+
"Ethics.txt": 125702,
|
| 26 |
+
"Euthyphro.txt": 17073,
|
| 27 |
+
"Fables de La Fontaine (French).txt": 165627,
|
| 28 |
+
"Fathers and Sons.txt": 117927,
|
| 29 |
+
"Fundamental Principles of the Metaphysic of Morals.txt": 42791,
|
| 30 |
+
"Hegel's Philosophy of Mind.txt": 172635,
|
| 31 |
+
"Ion.txt": 12818,
|
| 32 |
+
"La Pensée de l'Humanité (French).txt": 157229,
|
| 33 |
+
"Laughter An Essay on the Meaning of the Comic.txt": 56658,
|
| 34 |
+
"Man and Superman A Comedy and a Philosophy.txt": 101410,
|
| 35 |
+
"Mysticism and Logic and Other Essays.txt": 100246,
|
| 36 |
+
"Nature.txt": 24614,
|
| 37 |
+
"Nursing as Caring A Model for Transforming Practice.txt": 56142,
|
| 38 |
+
"On Liberty.txt": 71629,
|
| 39 |
+
"On the Nature of Things.txt": 115376,
|
| 40 |
+
"Parmenides.txt": 50415,
|
| 41 |
+
"Pascal's Pensées.txt": 162822,
|
| 42 |
+
"Phaedo.txt": 59297,
|
| 43 |
+
"Philosophiae Naturalis Principia Mathematica (Latin).txt": 276663,
|
| 44 |
+
"Politics A Treatise on Government.txt": 136638,
|
| 45 |
+
"Pragmatism A New Name for Some Old Ways of Thinking.txt": 74273,
|
| 46 |
+
"Principios e questões de philosophia politica (Vol. 1 of 2) (Portuguese).txt": 95611,
|
| 47 |
+
"Protagoras.txt": 40959,
|
| 48 |
+
"Second Treatise of Government.txt": 78427,
|
| 49 |
+
"Selections from the Writings of Kierkegaard.txt": 118409,
|
| 50 |
+
"Siddhartha eine indische Dichtung (German).txt": 71013,
|
| 51 |
+
"Siddhartha.txt": 57396,
|
| 52 |
+
"Six metaphysical meditations.txt": 57199,
|
| 53 |
+
"Sophist.txt": 66723,
|
| 54 |
+
"Symbolic Logic.txt": 131562,
|
| 55 |
+
"Symposium.txt": 47194,
|
| 56 |
+
"The Advancement of Learning.txt": 121880,
|
| 57 |
+
"The Analects of Confucius (from the Chinese Classics).txt": 48003,
|
| 58 |
+
"The Analysis of Mind.txt": 119143,
|
| 59 |
+
"The Antichrist.txt": 53358,
|
| 60 |
+
"The Birth of Tragedy; or, Hellenism and Pessimism.txt": 80627,
|
| 61 |
+
"The Case of Wagner, Nietzsche Contra Wagner, and Selected Aphorisms..txt": 45557,
|
| 62 |
+
"The Communist Manifesto.txt": 19440,
|
| 63 |
+
"The Consolation of Philosophy.txt": 64645,
|
| 64 |
+
"The Critique of Practical Reason.txt": 85468,
|
| 65 |
+
"The Critique of Pure Reason.txt": 279141,
|
| 66 |
+
"The Elementary Forms of the Religious Life.txt": 309826,
|
| 67 |
+
"The Ethics of Aristotle.txt": 153589,
|
| 68 |
+
"The Golden Sayings of Epictetus, with the Hymn of Cleanthes.txt": 36620,
|
| 69 |
+
"The Joyful Wisdom (La Gaya Scienza).txt": 126443,
|
| 70 |
+
"The Kybalion.txt": 49626,
|
| 71 |
+
"The Law.txt": 30680,
|
| 72 |
+
"The Life of Reason The Phases of Human Progress.txt": 426776,
|
| 73 |
+
"The Lives and Opinions of Eminent Philosophers.txt": 284048,
|
| 74 |
+
"The Oxford Book of American Essays.txt": 224431,
|
| 75 |
+
"The Picture of Dorian Gray.txt": 110023,
|
| 76 |
+
"The Poetics of Aristotle.txt": 24964,
|
| 77 |
+
"The Prince.txt": 71159,
|
| 78 |
+
"The Problems of Philosophy.txt": 59012,
|
| 79 |
+
"The Republic.txt": 282758,
|
| 80 |
+
"The Sayings of Confucius.txt": 52309,
|
| 81 |
+
"The Secret Doctrine, Vol. 1 of 4.txt": 494622,
|
| 82 |
+
"The Secret Doctrine, Vol. 3 of 4.txt": 385066,
|
| 83 |
+
"The symbolism of Freemasonry Illustrating and explaining its science and philosophy, its legends, myths and symbols.txt": 148348,
|
| 84 |
+
"The Tao Teh King, or the Tao and its Characteristics.txt": 18758,
|
| 85 |
+
"The Theory of Moral Sentiments.txt": 158439,
|
| 86 |
+
"The Twilight of the Idols; or, How to Philosophize with the Hammer. The Antichrist.txt": 103709,
|
| 87 |
+
"The Varieties of Religious Experience A Study in Human Nature.txt": 267612,
|
| 88 |
+
"The Will to Believe, and Other Essays in Popular Philosophy.txt": 145720,
|
| 89 |
+
"The Will to Power An Attempted Transvaluation of All Values. Book I and II.txt": 129710,
|
| 90 |
+
"The Will to Power An Attempted Transvaluation of All Values. Book III and IV.txt": 144631,
|
| 91 |
+
"The Works of Edgar Allan Poe — Volume 5.txt": 116428,
|
| 92 |
+
"The World as Will and Idea (Vol. 1 of 3).txt": 258295,
|
| 93 |
+
"The World as Will and Idea (Vol. 3 of 3).txt": 288437,
|
| 94 |
+
"Theaetetus.txt": 93043,
|
| 95 |
+
"Thus Spake Zarathustra A Book for All and None.txt": 165671,
|
| 96 |
+
"Utilitarianism.txt": 39631,
|
| 97 |
+
"Voltaire's Philosophical Dictionary.txt": 120793,
|
| 98 |
+
"What I believe.txt": 20310,
|
| 99 |
+
"What Is Art.txt": 115537,
|
| 100 |
+
"total_tokens": 12752908
|
| 101 |
+
}
|