Narayana02 commited on
Commit
c769427
·
verified ·
1 Parent(s): 161d9bb

Upload 10 files

Browse files
1_get_books.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import requests # For making HTTP requests
3
+ from bs4 import BeautifulSoup # For parsing HTML
4
+ import os # For file and directory operations
5
+ import re # For regular expressions
6
+ import time # For adding delays
7
+ import random # For generating random numbers
8
+
9
def sanitize_filename(filename):
    """Strip characters that are invalid in filenames on common platforms."""
    cleaned = re.sub(r'[\\/*?:"<>|]', "", filename)
    return cleaned
12
+
13
# Function to download a book given its URL and title
def download_book(url, title):
    """Download the Plain Text UTF-8 version of a Gutenberg book.

    Returns True if the book is on disk afterwards (downloaded now or on a
    previous run), False on any failure.
    """
    # Create a safe filename from the title
    filename = sanitize_filename(f"{title}.txt")

    # Skip work if the file already exists
    if os.path.exists(filename):
        # BUG FIX: message previously lost its {title} placeholder
        print(f"Skipping: {title} (already downloaded)")
        time.sleep(1 / 10)  # small politeness delay
        return True

    try:
        # Fetch the book's landing page
        response = requests.get(url)
        # FIX: surface HTTP error statuses as RequestException instead of
        # silently parsing an error page
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Locate the plain-text download link
        download_link = soup.find('a', string='Plain Text UTF-8')

        if download_link:
            # Construct the absolute download URL and fetch the text
            book_url = 'https://www.gutenberg.org' + download_link['href']
            book_content = requests.get(book_url).text

            # Save the book content to disk
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(book_content)
            # BUG FIX: message previously lost its {title} placeholder
            print(f"Downloaded: {title}")
            # Polite random delay between downloads
            time.sleep(random.uniform(1, 10))
            return True
        else:
            print(f"Could not find Plain Text UTF-8 download link for: {title}")
            time.sleep(random.uniform(1, 3))
            return False
    except requests.RequestException as e:
        print(f"Network error downloading {title}: {e}")
        time.sleep(random.uniform(1, 10))
        return False
    except OSError as e:
        print(f"Error downloading {title}: {e}")
        time.sleep(random.uniform(1, 10))
        return False
    except Exception as e:
        # Last-resort fallback so one bad book does not abort the whole run
        print(f"Unexpected error downloading {title}: {e}")
        time.sleep(random.uniform(1, 10))
        return False
62
+
63
# Generator over the books listed on one search-results page.
def get_book_links(url):
    """Yield (book_url, title) pairs for every book entry on the page at *url*."""
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        # Each result is an <li class="booklink"> holding the title span
        for entry in soup.find_all('li', class_='booklink'):
            span = entry.find('span', class_='title')
            if not span:
                continue
            # The enclosing <a> carries the relative link to the book page
            href = span.find_parent('a')['href']
            yield ('https://www.gutenberg.org' + href, span.text.strip())
    except requests.RequestException as e:
        print(f"Network error fetching book links: {e}")
    except Exception as e:
        print(f"Unexpected error fetching book links: {e}")
84
+
85
# Resolve the pagination link of a search-results page.
def get_next_page_url(url):
    """Return the absolute URL of the 'Next' results page, or None if absent
    or on any error."""
    try:
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        next_link = soup.find('a', string='Next')
        if next_link is None:
            return None
        return 'https://www.gutenberg.org' + next_link['href']
    except requests.RequestException as e:
        print(f"Network error fetching next page URL: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error fetching next page URL: {e}")
        return None
102
+
103
# Main execution block: crawl search pages and download books until done.
if __name__ == "__main__":
    # Starting point: Gutenberg search results for "philosophy"
    base_url = "https://www.gutenberg.org/ebooks/search/?query=philosophy&submit_search=Go%21"

    # How many books the user wants
    num_books = int(input("How many books do you want to download? "))

    # Work inside a dedicated 'books' directory
    if not os.path.exists('books'):
        os.makedirs('books')
    os.chdir('books')

    current_url = base_url
    books_downloaded = 0

    # Walk result pages until the quota is met or pagination runs out
    while books_downloaded < num_books and current_url:
        for book_url, title in get_book_links(current_url):
            if books_downloaded >= num_books:
                break
            if download_book(book_url, title):
                books_downloaded += 1

        current_url = get_next_page_url(current_url)
        time.sleep(2)  # pause between result pages

    print(f"Downloaded {books_downloaded} new books.")
2_remove_intros_from_books.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import os # For file and directory operations
3
+ import re # For regular expression operations
4
+ import asyncio # For asynchronous programming
5
+ import aiofiles # For asynchronous file I/O operations
6
+
7
# Strip the Project Gutenberg front matter from a book's text.
async def remove_intro(content):
    """Drop everything up to and including the line containing the
    '*** START OF THE PROJECT GUTENBERG EBOOK' marker; return the rest."""
    header = r'(?s).*?\*\*\* START OF THE PROJECT GUTENBERG EBOOK.*?\n'
    return re.sub(header, '', content, count=1)
13
+
14
# Clean one book: read, strip the Gutenberg intro, write the result.
async def process_file(input_path, output_path):
    """Read *input_path*, remove its Gutenberg header, write to *output_path*."""
    async with aiofiles.open(input_path, 'r', encoding='utf-8') as src:
        raw = await src.read()

    cleaned = await remove_intro(raw)

    async with aiofiles.open(output_path, 'w', encoding='utf-8') as dst:
        await dst.write(cleaned)
26
+
27
# Strip intros from every book in 'books', writing to 'processed_books'.
async def process_books():
    """Process all .txt files concurrently and report when done."""
    input_folder = 'books'
    output_folder = 'processed_books'

    # Ensure the destination folder exists
    os.makedirs(output_folder, exist_ok=True)

    # One task per text file so files are cleaned concurrently
    tasks = [
        asyncio.create_task(
            process_file(
                os.path.join(input_folder, name),
                os.path.join(output_folder, name),
            )
        )
        for name in os.listdir(input_folder)
        if name.endswith('.txt')
    ]

    await asyncio.gather(*tasks)
    print(f"All books processed. Output saved in '{output_folder}' folder.")

# Entry point of the script
if __name__ == '__main__':
    asyncio.run(process_books())
3_chunking_the_books.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import os # For file and directory operations
3
+ import json # For JSON file handling
4
+ from math import ceil # For rounding up numbers
5
+
6
# Split a text into overlapping word-based chunks.
def chunk_text(text, chunk_size=500, overlap_percent=25):
    """Return chunks of up to *chunk_size* words, consecutive chunks sharing
    overlap_percent% of a chunk's words."""
    words = text.split()
    # Number of words shared between consecutive chunks
    overlap = int(chunk_size * overlap_percent / 100)
    # Distance between the starts of consecutive chunks
    stride = chunk_size - overlap
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), stride)
    ]
25
+
26
# Chunk every processed book and collect the chunks into one JSON file.
def process_books(input_folder, output_file, chunk_size=500, overlap_percent=25):
    """Map each .txt filename in *input_folder* to its list of chunks and
    dump the mapping to *output_file* as JSON."""
    all_chunks = {}

    for name in os.listdir(input_folder):
        # Only text files are books
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(input_folder, name), 'r', encoding='utf-8') as fh:
            text = fh.read()
        all_chunks[name] = chunk_text(text, chunk_size, overlap_percent)

    # Persist everything in one JSON document
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(all_chunks, out, indent=4, ensure_ascii=False, separators=(',', ': '))

# Usage example
input_folder = 'processed_books'  # Folder containing the books to process
output_file = 'chunked_books.json'  # File to store the chunked books
chunk_size = 500  # Size of each chunk in words
overlap_percent = 25  # Percentage of overlap between chunks

process_books(input_folder, output_file, chunk_size, overlap_percent)
print(f"Chunked books have been saved to {output_file}")
4_embed_chromadb.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import json # For handling JSON data
3
+ import os # For interacting with the operating system
4
+ import asyncio # For asynchronous programming
5
+ import aiofiles # For asynchronous file operations
6
+ import chromadb # For vector database operations
7
+ from chromadb.utils import embedding_functions # For creating embedding functions
8
+ from termcolor import colored # For colored console output
9
+
10
# Load the chunked books produced by the chunking step.
async def load_chunked_books():
    """Read chunked_books.json asynchronously and return the parsed mapping."""
    async with aiofiles.open('chunked_books.json', 'r', encoding='utf-8') as fh:
        raw = await fh.read()
    return json.loads(raw)
15
+
16
# Store one chunk (with metadata) in the Chroma collection.
async def embed_chunk(collection, book_title, i, chunk, semaphore):
    """Add *chunk* to *collection* under a deterministic id, bounded by
    *semaphore*.

    NOTE(review): collection.add is a blocking call, so this coroutine does
    not yield while embedding — the semaphore only bounds task entry.
    """
    async with semaphore:
        collection.add(
            documents=[chunk],
            metadatas=[{"book_title": book_title, "chunk_index": i}],
            ids=[f"{book_title}_chunk_{i}"]
        )
26
+
27
# Schedule embedding tasks for every chunk of every (selected) book.
async def process_chunks(chunked_books, collection, max_books=None):
    """Embed all chunks of up to *max_books* books into *collection*."""
    # Cap on concurrently entered embedding tasks
    semaphore = asyncio.Semaphore(30000)
    tasks = []

    for book_index, (book_title, chunks) in enumerate(chunked_books.items()):
        # Honor the optional book limit
        if max_books is not None and book_index >= max_books:
            break
        print(colored(f"Processing {book_title}", "green"))
        tasks.extend(
            asyncio.create_task(embed_chunk(collection, book_title, pos, chunk, semaphore))
            for pos, chunk in enumerate(chunks)
        )

    print(colored(f"Embedding {len(tasks)} chunks", "blue"))

    # Run every embedding task to completion
    await asyncio.gather(*tasks)
48
+
49
# Orchestrate the whole embedding run.
async def main():
    """Load chunked books and embed every chunk into a persistent Chroma DB."""
    chunked_books = await load_chunked_books()

    # Persistent Chroma client backed by ./chroma_db
    client = chromadb.PersistentClient(path="./chroma_db")

    # Embeddings come from OpenAI's text-embedding-3-large model
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-large"
    )

    # Collection configured for cosine-distance queries
    collection = client.get_or_create_collection(
        name="books_collection",
        embedding_function=openai_ef,
        metadata={"hnsw:space": "cosine"}
    )

    await process_chunks(chunked_books, collection, max_books=None)

    # Report how many chunks were handled overall
    total_chunks = sum(len(chunks) for chunks in chunked_books.values())
    print(f"Embedded and stored {total_chunks} chunks in the Chroma collection.")

# Entry point of the script
if __name__ == "__main__":
    asyncio.run(main())
5_querry_library.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import os # For environment variables
3
+ import json # For JSON operations
4
+ import chromadb # For vector database operations
5
+ from chromadb.utils import embedding_functions # For creating embedding functions
6
+ from termcolor import colored # For colored console output
7
+
8
# Run a similarity search against the persisted books collection.
def query_collection(query, n_results=5):
    """Return the raw Chroma query result for *query* (top *n_results* hits)."""
    # Persistent client over the local ./chroma_db store
    client = chromadb.PersistentClient(path="./chroma_db")

    # Must match the embedding function used at indexing time
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-large"
    )

    collection = client.get_collection(
        name="books_collection",
        embedding_function=openai_ef,
    )

    return collection.query(query_texts=[query], n_results=n_results)
32
+
33
# Function to save query results to a JSON file
def save_results_to_json(results, query, n_results):
    """Write the top chunks of a Chroma query result to query_results.json.

    *results* is the mapping returned by collection.query(); only the first
    (and only) query's documents/metadatas lists are recorded.
    """
    json_results = {
        "query": query,
        "n_results": n_results,
        "results": []
    }

    # Pair each returned document with its metadata, ranked from 1
    docs = results['documents'][0]
    metas = results['metadatas'][0]
    for rank, (doc, metadata) in enumerate(zip(docs, metas), 1):
        json_results["results"].append({
            "result_number": rank,
            "book_title": metadata['book_title'],
            "chunk_index": metadata['chunk_index'],
            "content": doc
        })

    # FIX: open with explicit UTF-8 — the platform default encoding can raise
    # UnicodeEncodeError on non-ASCII book content (e.g. on Windows).
    with open("query_results.json", "w", encoding="utf-8") as f:
        json.dump(json_results, f, indent=4)
54
+
55
# Interactive loop: query the book collection and persist each result set.
def main():
    """Prompt for queries, show the matching chunks, save them to JSON."""
    print(colored("Welcome to the Book Query System!", "green"))
    print("You can ask questions about the books in the collection.")

    while True:
        query = input(colored("\nEnter your query (or 'quit' to exit): ", "yellow"))
        if query.lower() == 'quit':
            break

        # Number of hits the user wants back
        n_results = int(input(colored("How many results would you like? ", "yellow")))

        results = query_collection(query, n_results)

        # Display each hit with its source metadata
        print(colored("\nTop matching chunks:", "green"))
        for rank, (doc, metadata) in enumerate(
                zip(results['documents'][0], results['metadatas'][0]), 1):
            print(colored(f"\nResult {rank}:", "blue"))
            print(f"Book: {metadata['book_title']}")
            print(f"Chunk Index: {metadata['chunk_index']}")
            print(colored("Content:", "cyan"))
            print(doc)

        save_results_to_json(results, query, n_results)
        print(colored("\nResults have been saved to query_results.json", "green"))

    print(colored("Thank you for using the Book Query System!", "green"))

# Check if the script is being run directly
if __name__ == "__main__":
    main()
6_chat_with_a_library.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import os # For environment variables
3
+ import json # For JSON operations
4
+ import chromadb # For vector database operations
5
+ from chromadb.utils import embedding_functions # For creating embedding functions
6
+ from termcolor import colored # For colored console output
7
+ from openai import OpenAI # For interacting with OpenAI's API
8
+
9
# Retrieve the most similar chunks for a query from the vector store.
def query_collection(query, n_results):
    """Return the raw Chroma query result for *query* (top *n_results* hits)."""
    # Persistent client over the local ./chroma_db store
    client = chromadb.PersistentClient(path="./chroma_db")
    # Must match the embedding function used at indexing time
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-large"
    )
    collection = client.get_collection(
        name="books_collection",
        embedding_function=openai_ef,
    )
    return collection.query(query_texts=[query], n_results=n_results)
29
+
30
# Function to save retrieved chunks to a JSON file
def save_chunks_to_json(results, query):
    """Persist retrieved chunks (with their source metadata) to
    retrieved_chunks.json and confirm on the console.

    *results* is the mapping returned by collection.query(); only the first
    query's documents/metadatas lists are recorded.
    """
    json_data = {
        "query": query,
        "chunks": []
    }
    # Pair each document with its metadata entry
    for doc, metadata in zip(results['documents'][0], results['metadatas'][0]):
        json_data["chunks"].append({
            "book_title": metadata['book_title'],
            "chunk_index": metadata['chunk_index'],
            "content": doc
        })

    # FIX: open with explicit UTF-8 — the platform default encoding can raise
    # UnicodeEncodeError on non-ASCII book content (e.g. on Windows).
    with open("retrieved_chunks.json", "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4)

    # Confirmation for the user
    print(colored("Retrieved chunks have been saved to retrieved_chunks.json", "green"))
52
+
53
# Stream a model answer grounded in the retrieved context.
def generate_response(client, query, context):
    """Ask GPT-4o to answer *query* using *context*, printing tokens as they
    stream in."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant. Answer the query based on the provided context. in 100 words or less. unless user asks for more detail. Do not refer to any author but just combine the best pieces of the provided content to answer the query. Only answer based on the provided content."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuery: {query}"}
    ]
    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        stream=True
    )
    # Echo each delta as soon as it arrives
    for part in stream:
        delta = part.choices[0].delta.content
        if delta is not None:
            print(colored(delta, "green"), end='', flush=True)

    print()
72
+
73
# Interactive RAG chat: retrieve chunks, then answer with GPT-4o.
def main():
    """Prompt for queries, retrieve supporting chunks, stream an answer."""
    print(colored("Welcome to the Book Query System!", "green"))
    # One OpenAI client for the whole session
    client = OpenAI()
    # Context size is fixed once per session
    n_results = int(input(colored("How many chunks would you like to use for context? ", "yellow")))

    while True:
        query = input(colored("\nEnter your query (or 'quit' to exit): ", "yellow"))
        if query.lower() == 'quit':
            break

        results = query_collection(query, n_results)

        # Keep an on-disk record of what was retrieved
        save_chunks_to_json(results, query)

        # Flatten the retrieved chunks into one context string for the model
        context = "\n\n".join(
            f"Book: {metadata['book_title']}\nChunk Index: {metadata['chunk_index']}\nContent: {doc}"
            for doc, metadata in zip(results['documents'][0], results['metadatas'][0])
        )

        print(colored("\nGenerating response based on the context...", "green"))
        generate_response(client, query, context)

    print(colored("Thank you for using the Book Query System!", "green"))

# Run the main function if this script is executed directly
if __name__ == "__main__":
    main()
count_tokens.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tiktoken
4
+
5
def count_tokens(text, encoding_name="cl100k_base"):
    """Return the number of tiktoken tokens in *text* under *encoding_name*."""
    enc = tiktoken.get_encoding(encoding_name)
    return len(enc.encode(text))
8
+
9
def count_words(text):
    """Count whitespace-delimited words in *text*."""
    words = text.split()
    return len(words)
11
+
12
def process_books(input_folder, output_file):
    """Tally tiktoken tokens and words for each .txt book in *input_folder*
    and write per-book plus total counts to *output_file* as JSON."""
    all_counts = {}
    total_tokens = 0
    total_words = 0

    for name in os.listdir(input_folder):
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(input_folder, name), 'r', encoding='utf-8') as fh:
            body = fh.read()
        tokens = count_tokens(body)
        words = count_words(body)
        all_counts[name] = {"tokens": tokens, "words": words}
        total_tokens += tokens
        total_words += words

    # Append grand totals alongside the per-book entries
    all_counts["total_tokens"] = total_tokens
    all_counts["total_words"] = total_words
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(all_counts, outfile, indent=4, ensure_ascii=False)

    print(f"Total tokens across all books: {total_tokens}")
    print(f"Total words across all books: {total_words}")
    print(f"Token and word counts for each book and totals have been saved to {output_file}")

# Usage
input_folder = 'processed_books'
output_file = 'token_and_word_counts.json'

process_books(input_folder, output_file)
license_.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ all software provided by echohive is provided "as is", without warranty of any kind, express or
3
+ implied, including but not limited to the warranties of merchantability,
4
+ fitness for a particular purpose and noninfringement. in no event shall the
5
+ authors or copyright holders be liable for any claim, damages or other
6
+ liability, whether in an action of contract, tort or otherwise, arising from,
7
+ out of or in connection with the software or the use or other dealings in
8
+ the software.
9
+
10
+ the software shall not be used for any malicious purposes
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
- streamlit
2
- openai
 
 
1
+ openai==1.34.0
2
+ chromadb==0.4.24
3
+ termcolor==2.4.0
token_counts.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "A little philosophy of life.txt": 9132,
3
+ "A System of Logic, Ratiocinative and Inductive.txt": 546472,
4
+ "A Treatise of Human Nature.txt": 296762,
5
+ "Aesthetical Essays of Friedrich Schiller.txt": 189986,
6
+ "Ainsi Parlait Zarathoustra (French).txt": 194454,
7
+ "Also sprach Zarathustra Ein Buch für Alle und Keinen (German).txt": 169738,
8
+ "An Enquiry Concerning Human Understanding.txt": 82190,
9
+ "An Enquiry Concerning the Principles of Morals.txt": 70435,
10
+ "Anthropology [a lecture delivered at Columbia University in the series on science, philosophy and art, December 18, 1907].txt": 13097,
11
+ "Aphorismen zur Lebensweisheit (German).txt": 140512,
12
+ "Apology.txt": 24980,
13
+ "Aristotle on the art of poetry.txt": 30192,
14
+ "Autobiography.txt": 102169,
15
+ "Beyond Good and Evil.txt": 94142,
16
+ "Bushido, the Soul of Japan.txt": 51071,
17
+ "Considerations on Representative Government.txt": 125315,
18
+ "Democracy and Education An Introduction to the Philosophy of Education.txt": 176129,
19
+ "Der Wille zur Macht Eine Auslegung alles Geschehens (German).txt": 224176,
20
+ "Dialogues Concerning Natural Religion.txt": 52536,
21
+ "Discours de la méthode (French).txt": 204766,
22
+ "Discourse on the Method of Rightly Conducting One's Reason and of Seeking Truth in the Sciences.txt": 32497,
23
+ "Essais de Montaigne (self-édition) - Volume I (French).txt": 582802,
24
+ "Essays of Schopenhauer.txt": 102591,
25
+ "Ethics.txt": 125702,
26
+ "Euthyphro.txt": 17073,
27
+ "Fables de La Fontaine (French).txt": 165627,
28
+ "Fathers and Sons.txt": 117927,
29
+ "Fundamental Principles of the Metaphysic of Morals.txt": 42791,
30
+ "Hegel's Philosophy of Mind.txt": 172635,
31
+ "Ion.txt": 12818,
32
+ "La Pensée de l'Humanité (French).txt": 157229,
33
+ "Laughter An Essay on the Meaning of the Comic.txt": 56658,
34
+ "Man and Superman A Comedy and a Philosophy.txt": 101410,
35
+ "Mysticism and Logic and Other Essays.txt": 100246,
36
+ "Nature.txt": 24614,
37
+ "Nursing as Caring A Model for Transforming Practice.txt": 56142,
38
+ "On Liberty.txt": 71629,
39
+ "On the Nature of Things.txt": 115376,
40
+ "Parmenides.txt": 50415,
41
+ "Pascal's Pensées.txt": 162822,
42
+ "Phaedo.txt": 59297,
43
+ "Philosophiae Naturalis Principia Mathematica (Latin).txt": 276663,
44
+ "Politics A Treatise on Government.txt": 136638,
45
+ "Pragmatism A New Name for Some Old Ways of Thinking.txt": 74273,
46
+ "Principios e questões de philosophia politica (Vol. 1 of 2) (Portuguese).txt": 95611,
47
+ "Protagoras.txt": 40959,
48
+ "Second Treatise of Government.txt": 78427,
49
+ "Selections from the Writings of Kierkegaard.txt": 118409,
50
+ "Siddhartha eine indische Dichtung (German).txt": 71013,
51
+ "Siddhartha.txt": 57396,
52
+ "Six metaphysical meditations.txt": 57199,
53
+ "Sophist.txt": 66723,
54
+ "Symbolic Logic.txt": 131562,
55
+ "Symposium.txt": 47194,
56
+ "The Advancement of Learning.txt": 121880,
57
+ "The Analects of Confucius (from the Chinese Classics).txt": 48003,
58
+ "The Analysis of Mind.txt": 119143,
59
+ "The Antichrist.txt": 53358,
60
+ "The Birth of Tragedy; or, Hellenism and Pessimism.txt": 80627,
61
+ "The Case of Wagner, Nietzsche Contra Wagner, and Selected Aphorisms..txt": 45557,
62
+ "The Communist Manifesto.txt": 19440,
63
+ "The Consolation of Philosophy.txt": 64645,
64
+ "The Critique of Practical Reason.txt": 85468,
65
+ "The Critique of Pure Reason.txt": 279141,
66
+ "The Elementary Forms of the Religious Life.txt": 309826,
67
+ "The Ethics of Aristotle.txt": 153589,
68
+ "The Golden Sayings of Epictetus, with the Hymn of Cleanthes.txt": 36620,
69
+ "The Joyful Wisdom (La Gaya Scienza).txt": 126443,
70
+ "The Kybalion.txt": 49626,
71
+ "The Law.txt": 30680,
72
+ "The Life of Reason The Phases of Human Progress.txt": 426776,
73
+ "The Lives and Opinions of Eminent Philosophers.txt": 284048,
74
+ "The Oxford Book of American Essays.txt": 224431,
75
+ "The Picture of Dorian Gray.txt": 110023,
76
+ "The Poetics of Aristotle.txt": 24964,
77
+ "The Prince.txt": 71159,
78
+ "The Problems of Philosophy.txt": 59012,
79
+ "The Republic.txt": 282758,
80
+ "The Sayings of Confucius.txt": 52309,
81
+ "The Secret Doctrine, Vol. 1 of 4.txt": 494622,
82
+ "The Secret Doctrine, Vol. 3 of 4.txt": 385066,
83
+ "The symbolism of Freemasonry Illustrating and explaining its science and philosophy, its legends, myths and symbols.txt": 148348,
84
+ "The Tao Teh King, or the Tao and its Characteristics.txt": 18758,
85
+ "The Theory of Moral Sentiments.txt": 158439,
86
+ "The Twilight of the Idols; or, How to Philosophize with the Hammer. The Antichrist.txt": 103709,
87
+ "The Varieties of Religious Experience A Study in Human Nature.txt": 267612,
88
+ "The Will to Believe, and Other Essays in Popular Philosophy.txt": 145720,
89
+ "The Will to Power An Attempted Transvaluation of All Values. Book I and II.txt": 129710,
90
+ "The Will to Power An Attempted Transvaluation of All Values. Book III and IV.txt": 144631,
91
+ "The Works of Edgar Allan Poe — Volume 5.txt": 116428,
92
+ "The World as Will and Idea (Vol. 1 of 3).txt": 258295,
93
+ "The World as Will and Idea (Vol. 3 of 3).txt": 288437,
94
+ "Theaetetus.txt": 93043,
95
+ "Thus Spake Zarathustra A Book for All and None.txt": 165671,
96
+ "Utilitarianism.txt": 39631,
97
+ "Voltaire's Philosophical Dictionary.txt": 120793,
98
+ "What I believe.txt": 20310,
99
+ "What Is Art.txt": 115537,
100
+ "total_tokens": 12752908
101
+ }