# Lecture2HelloWorldAndExpressions.py
print('Hello, world!')
print('Hello 1')
print('Hello 2')
print('Hello 3')
print('Hello, world!')
print(Hello, world!)
print(Hello, world!) # Intentionally creates an error!
print(1)         # Technically an expression
print(1+2)       # Two operands and an operator make an expression
print(10*(10+1)) # The expression (10+1) acting as an operand
print(3 + 8 / 2)  # What do you predict?
print(4 * 2 + 3 + 5 * 2)  # And this one?
print('Hello', 'world', '!')
print(max(2,5,7))
print(max(2,7) + max(3,9)) # Using function calls as operands
print(max(2,7) + max(3,9)) # Calc 7, calc 9, then add
1
2
3
max(2,7)
None
print(2) + 2
print('Hello, world!')
max(2 ** 8, 3 ** 6, 5 ** 3)
1.0000000000000001 - 1
print(type(-100)) # int
print(type(10.1)) # float
print(type('A'))  # str
print(type(True)) # bool
print(type('10')) # str
print(type(10))   # int
print(type(10.0)) # float
print(type(True)) # bool
0.1 + 0.1 + 0.1
'Hello ' + 1111
'Hello ' + 'world' + '!'
'Hello ' + str(1111)
20 * 9/5 + 32
print('Temp: 68.0 F')
print('Temp: ' + 20 * 9/5 + 32 + ' F')
print('Temp: ' + str(20 * 9/5 + 32) + ' F')
# Lecture3VariablesAndConditions.py
two_to_the_eighth = 2 ** 8
print(two_to_the_eighth)
two_to_the_eighth * 2
pay_per_hour = 18
pay_per_hour = 20  # Pay raise!
print(pay_per_hour)
counter = 0
counter = counter + 1 # It's an instruction, not an equality!
print(counter)
counter = counter + 1
print(counter)
pay_per_hour = 20
hours = 40
total_pay = pay_per_hour * hours
print(total_pay)
Pay_Per_Hour = 15   # please avoid this capitalization style!
print(pay_per_hour) # remembers the lowercase value
silent_assignment = 0
20 = pay_per_hour
print(undefined_var + 7)
color = input('What is your favorite color? ')
print('Yeah, ' + color + ' is pretty great!')
to_square_str = input('What should I square? ')
print(int(to_square_str) ** 2)
city = input('What city are we in? ')
print(city == 'Boston')
    
answer = input('What is 2+2? ')
print(answer == 4) # not going to work
answer == '4'   # but this works
int(answer) == 4 # or this
float(answer) == 4 # or even this
print(1 < 1)
print(1 > 1)
print(1 != 1)
print(1 <= 1)
print(1 >= 1)
print('aardvark' < 'zebra')
print('capitalized' == 'Capitalized')
2 + 5 > 7 - 4  # 5 > 7 would be false, but (2+5) > (7-4) is True
total = 0
value_str = input('Enter a value: ')
value_int = int(value_str)
if value_int < 0:
    print('Sorry, that was a negative value.')
else:
    total = total + value_int
print(total)
if condition:
    statement_if_true1
    statement_if_true2
    statement_if_true3
    ...
else:
    statement_if_false1
    statement_if_false2
    ...
statement_regardless1
statement_regardless2
...
value = int(input('Enter an integer:'))
if value < 0:
    print('Negative')
else:
    print('Positive')
print('Done')
password = input('Enter the password: ')
if password == '1234':
    print('Correct!')
    print('Your account has $1000000 in it.')
else:
    print('Incorrect.')
print('Have a nice day.')
num1_str = input('Enter an integer: ')
num2_str = input('Enter a different integer: ')
num1_int = int(num1_str)
num2_int = int(num2_str)
if num1_str == num2_str:
    print('The numbers were supposed to be different...')
    print('But you entered ' + num1_str + ' twice!')
else:
    print(num2_str + ' divided by ' + num1_str + ' is...')
    print(num2_int / num1_int)  # Divide by zero would be error, btw
print('Done...')
language = input('What is your favorite language? ')
if language == 'Python':
    print('Mine too!')
print('But there sure are a lot of languages out there....')
value = int(input('Enter an integer between 0 and 100: '))
if value < 0:
    print('No negative numbers!')
elif value > 100:
    print('That value is too large!')
elif value == 42:
    print('That was the number I was thinking of!')
else:
    print('Guess again.')
    
value = int(input('Enter an integer between 0 and 100: '))
if value < 0:
    print('No negative numbers!')
elif value > 100:
    print('That value is too large!')
elif value >= 50:
    print('Big!')
else:
    print('Small!')
value = int(input('Enter an integer between 0 and 100: '))
if value < 0:
    print('No negative numbers!')
else:
    if value > 100:
        print('That value is too large!')
    else:
        if value >= 50:
            print('Big!')
        else:
            print('Small!')
age = int(input('Enter your age: '))
if age < 18:
    if age < 5:
        print('Just a toddler, then.')
    elif age < 12:
        print('Not quite a teenager, then.')
    else:
        print('Teenage years ... a difficult time!')
else:
    print('An adult, then.')
    if age >= 55:
        print('And a senior citizen, too!')
num1 = int(input('First number: '))
num2 = int(input('Second number: '))
num3 = int(input('Third number: '))
my_max = max(num1, num2, num3)
my_min = min(num1, num2, num3)
my_mean = (num1+num2+num3)/3  # Note importance of parens!
print('Min: ' + str(my_min))
print('Max: ' + str(my_max))
print('Mean: ' + str(my_mean))
if num1 == num2:
    print(str(num1) + ' was repeated')
elif num2 == num3:
    print(str(num2) + ' was repeated')
elif num1 == num3:
    print(str(num3) + ' was repeated')
else:
    print('The numbers were unique')
# Lecture4WhileAndLists.py
string = input('Enter a number: ')
while string != 'stop':
    print(string + ' squared is ' + str(int(string) ** 2))
    string = input('Enter a number: ')
print('Done.')
counter = 0
while counter < 21:
    print(counter)
    counter = counter + 1
print(counter)
counter = 1
print('We will now iterate three times...')
while counter < 4:
    print('Iteration ' + str(counter))
    counter = counter + 1
total = 0
count = 0
value_str = input('Enter a number, or "done" if done: ')
while value_str != 'done':
    count = count + 1
    value_int = int(value_str)
    total = total + value_int
    value_str = input('Enter a number, or "done" if done: ')
if count > 0:
    print('The average is ' + str(total/count))
total = 0
count = 0
value_str = input('Enter a number, or "done" if done: ')
while value_str != 'done':
    count += 1
    value_int = int(value_str)
    total += value_int
    value_str = input('Enter a number, or "done" if done: ')
if count > 0:
    print('The average is ' + str(total/count))
while(True):
    input('Enter any input to get a compliment: ')
    print('That is so clever of you!')
my_list = ['duck', 'duck', 'goose']  # A list with 3 items
print(my_list[0])
print(my_list[1])
print(my_list[2])
my_list = ['duck', 'duck', 'goose']
my_list[2] = 'bear'
print(my_list)
my_list = [1, 2, 3]
my_list.append(4)
print(my_list)  # my_list has changed...
print(my_list.append(5))
print(my_list)
shopping_list = []
item = input('Add an item to the shopping list (or "done"): ')
while item.lower() != 'done':
    shopping_list.append(item)
    item = input('Add an item to the shopping list (or "done"): ')
print('Okay, so that was: ')
print(shopping_list)
[1, 2, 3] + [4, 5, 6]
print(len('Hello'))
print(len([1, 2, 3]))
my_items = ['eggs', 'flour', 'milk']
print(len(my_items), 'items')
print(my_items[2])
print(my_items[len(my_items)-1])
planet_diameter_km = [4879, 12104, 12756, 6792, 142984, 120536, 51118, 49528, 2377]
planet_diameter_km.sort()
planet_diameter_km
my_list1 = [3, 2, 1]
my_list2 = my_list1
my_list1.sort()
print(my_list1)
print(my_list2)
my_list1 = [3, 2, 1]
my_list2 = my_list1.copy()
my_list1.sort()
print(my_list1)
print(my_list2)
honors = ['Albert', 'Berenice', 'Chen', 'Dominique']
mentioned_honors = []
nonhonors = []
student = input('Enter a name (or "done"): ')
while (student != 'done'):
    if student in honors:
        print('Honors!')
        mentioned_honors.append(student)
    else:
        print('Not honors...')
        nonhonors.append(student)
    student = input('Enter a name (or "done"): ')
print('Honors mentioned: ' + str(mentioned_honors))
print('Nonhonors mentioned: ' + str(nonhonors))
# Lecture5MorePower.py
percent = input('Enter a percentage between 0 and 100:')
if float(percent) >= 0 and float(percent) <= 100:
    if float(percent) >= 10:
        print('A decent return on investment....')
    else:
        print('Not a great return on investment....')
else:
    print('That is not in the requested range!')
vip = False
spent = 10
if vip or spent >= 10000:
    print('Send this person a loyalty reward!')
else:
    print('This person deserves nothing!')
vip = False
if not vip:
    print('Have you considered signing up to join the VIP program?')
else:
    print('Welcome back, VIP customer!')
vip = False
spent = 0
if not vip or spent < 10000:  # "not" applied to vip before "or"
    print('Please spend more')
else:
    print('Hello, valued patron!')
vip = False
spent = 0
if not (vip or spent < 10000): # within parens evaluates to True
    print('Please spend more')
else:
    print('Hello, valued patron!')
my_list = [1,2,3]
my_list2 = [7,8,9]
if not 4 in my_list and not 4 in my_list2:
    print('No 4 found')
my_list = [1,2,3]
my_list2 = [7,8,9]
if 4 not in my_list and not in my_list2:
    print('This will actually cause an error - not how "in" works')
import math
math.sqrt(2)
import math as m
m.sqrt(2)
from math import sqrt as my_sqrt
my_sqrt(2)
get_ipython().system('python3 -m ensurepip --upgrade')
get_ipython().system('pip install seaborn')
import seaborn as sns
df = sns.load_dataset("penguins") # Load a dataset about penguins
sns.jointplot(data=df, x="flipper_length_mm", y="bill_length_mm", hue="species")
import statistics
statistics.median([1, 2, 3, 4])
import statistics
statistics.median([1, 2, 3, 4])
total = 0
count = 0
value_str = input('Enter a number, or "done" if done: ')
while value_str != 'done':
    count = count + 1
    value_int = int(value_str)
    total = total + value_int
    value_str = input('Enter a number, or "done" if done: ')
if count > 0:
    print('The average is ' + str(total/count))
total = 0
count = 0
value_str = input('Enter a non-negative integer, or "done" if done: ')
while value_str != 'done':
    if not value_str.isdigit():
        print('Non-negative integers only!')
    else:
        count = count + 1
        value_int = int(value_str)
        total = total + value_int
    value_str = input('Enter a non-negative integer, or "done" if done: ')
if count > 0:
    print('The average is ' + str(total/count))
total = 0
count = 0
value_str = input('Enter a number, or "done" if done: ')
while value_str != 'done':
    count = count + 1
    value_int = int(value_str)
    total = total + value_int
    print(value_str)
if count > 0:
    print('The average is ' + str(total/count))
3 = my_list
total = 0
count = 0
value_str = input('Enter a number, or "done" if done: ')
count = count + 1
value_int = int(value_str)
total = total + value_int
if count > 0:
    print('The average is ' + str(total/count))
# Lecture6and7Iteration.py
people = ['Alice', 'Bob', 'Che']
index = 0
while index < len(people):
    person = people[index]
    print('Hooray for ' + person + '!')
    index += 1
people = ['Alice', 'Bob', 'Che']
for person in people:
    print('Hooray for ' + person + '!')
running_total = 0
numbers = [1,2,3,4,10]
for n in numbers:
    running_total = running_total + n  # Could be abbreviated running_total += n
    print('Sum so far: ' + str(running_total))
print('Sum: ' + str(running_total))
# Translate numeric grades to letter grades with a lookup table.
# A grade that has no entry in the table is skipped (no letter is added).
my_grades = [4, 3, 2, 3, 4]
grade_to_letter = {4: 'A', 3: 'B', 2: 'C'}
letter_grades = []
for g in my_grades:
    if g in grade_to_letter:
        letter_grades.append(grade_to_letter[g])
print(letter_grades)
temps_f = [36, 39, 45, 56, 66, 76, 81, 80, 72, 61, 51, 41] # Jan through Dec
temps_c = []
for t in temps_f:
    degrees_c = (t - 32)*5/9
    temps_c.append(round(degrees_c, 2)) # Round to 2 decimal places
temps_c
my_car = ("Honda Fit", 2010, 30, 10000)
print(my_car)
car_type, year, mpg, price = my_car
print(mpg)
print(my_car[0] + ' prints successfully')  # OK
my_car[0] = 'bad value' # Not OK, trying to change the tuple
my_movies = [("No", 4), ("Rogue One", 4.5), ("Casablanca", 5)]
for moviename, stars in my_movies:  # Notice the two variable names
    print ('I would rate ' + moviename + ' ' + str(stars) + ' stars')
my_movies = [("No", 4), ("Rogue One", 4.5), ("Casablanca", 5)]
best_rating = 0 # Initialize with a value that is definitely beat
best_movie = "none"
for movie, rating in my_movies:
    if rating > best_rating:
        best_rating = rating
        best_movie = movie
print("Best movie: " + best_movie + "...rating = " + str(best_rating))
movies = ['Fall Guy', 'Free Guy', 'Cable Guy']
ratings = [5, 4, 3]
for movie, rating in zip(movies, ratings):
    print("I'd rate " + movie + " a " + str(rating))
# Each entry is a (title, score) pair.
sw_movies = [
    ('The Phantom Menace', 52),
    ('Attack of the Clones', 65),
    ('Revenge of the Sith', 80),
    ('Rogue One', 84),
    ('Solo', 70),
    ('Star Wars', 92),
    ('The Empire Strikes Back', 94),
    ('Return of the Jedi', 82),
    ('The Force Awakens', 93),
    ('The Last Jedi', 90),
    ('The Rise of Skywalker', 51),
]
# Collect the titles whose score is at least 80, in their original order.
my_list = []
for movie, score in sw_movies:
    if score < 80:
        continue  # below the cutoff, skip this title
    my_list.append(movie)
print(my_list)
for i in range(5):
  print ("Iteration " + str(i))
for i in range(1,6):
    print(i)
my_itinerary = ['Boston', 'Atlanta', 'LA', 'Seattle']
for idx in range(len(my_itinerary)-1):  # Avoid indexing out of bounds
    print(my_itinerary[idx] + '-' + my_itinerary[idx+1])
names = ['Alice', 'Bob', 'Charlie', 'Dora']
for number, name in enumerate(names):
    print(name + ' ' + str(number))
for movie, rating in sw_movies:
    print('Looking at ' + movie)
    if movie == 'Rogue One':
        print('The rating of Rogue One is ' + str(rating))
        break  # We don't need to look at any other entries
print('Done')
my_two_stock_histories = [[40.1, 40.2, 39.9, 40.2],
                         [100.2, 99.9, 100.0, 103.1]]
my_two_stock_histories = [[40.1, 40.2, 39.9, 40.2],
                         [100.2, 99.9, 100.0, 103.1]]
my_two_stock_histories[1]
my_two_stock_histories = [[40.1, 40.2, 39.9, 40.2],
                         [100.2, 99.9, 100.0, 103.1]]
my_two_stock_histories[1][2]
my_stock_histories = my_two_stock_histories.copy()
my_stock_histories.append([5.0, 9.0, 6.0, 7.0])
print(my_stock_histories)
print('Stock 0 closing prices: ')
for price in my_stock_histories[0]:
    print(price)
print('Starting prices for all stocks:')
for stock_list in my_stock_histories:
    print(stock_list[0])
letters = ['a', 'b', 'c','d','e','f','g','h','i','j']
print('All possible coordinates in Battleship:')
for l in letters:
    for n in range(1,11):
        print(l + str(n))
bills = [[1, 2, 3], [4,5,6], [7,8,9]]
my_totals = [] # empty list
for l in bills:
  print('new list')
  listsum = 0
  for l2 in l: # iterating over the list we got from the outer foreach
    print('adding ' + str(l2))
    listsum += l2
  my_totals.append(listsum)
print('Bill sums:' + str(my_totals))
print('Possible matchups:')
players = ['Alice', 'Bobby', 'Caspar', 'Dmitri', 'Eve']
for white_player in players:
  for black_player in players:
    print("White: " + white_player + "; Black player: " + black_player)
print('Possible matchups:')
players = ['Alice', 'Bobby', 'Caspar', 'Dmitri', 'Eve']
for white_player in players:
  for black_player in players:
    if not white_player == black_player:
        print("White: " + white_player + "; Black player: " + black_player)
my_multiples_of_3 = [v * 3 for v in range(5)]
my_multiples_of_3
unrounded = [1.9, 5.3, 9.9]
rounded  = [round(i,0) for i in unrounded]
rounded
unrounded = [1.9, 5.3, 9.9]
rounded = []
for item in unrounded:
    rounded.append(round(item,0))
print(rounded)
temps_f = [36, 39, 45, 56, 66, 76, 81, 80, 72, 61, 51, 41] # Jan through Dec
temps_c = [round((t-32)*5/9,2) for t in temps_f]
temps_c
times = [(2,30), (4,10), (1, 30), (0,40), (0, 20)]
minutes = [t[0]*60 + t[1] for t in times]
minutes
# Lecture8and9Functions.py
def add_an_s(string):
    """Return a new string: `string` with the letter 's' appended."""
    return string + 's'
add_an_s('example') + '!'
records = read_customer_data('input.csv')
sales = 0
purchase_counts = []
s_names = []
for record in records:
    name, purchase_list, sale_info = parse_record(record)
    s_names.append(standardize_name(name))
    sales = update_total_sales(sales, sale_info)
    update_purchase_counts(purchase_counts, purchase_list)
write_to_file(s_names, purchase_counts, sales, 'output.csv')
def add_two(my_number):
    """Return my_number plus two."""
    result = my_number + 2
    return result
add_two(2)
def count_matches(to_match, my_list):
    """Return how many items of my_list compare equal to to_match."""
    # Each equal item contributes 1 to the sum; an empty list sums to 0.
    return sum(1 for item in my_list if to_match == item)
print(count_matches(5, [5, 6, 7, 5]))
print(count_matches("foo", ["foo","bar","baz"]))
def percent_gain(start, finish):
    """Return the change from start to finish as a percentage of start.

    A negative result means finish is lower than start.  Dividing by a
    start of 0 raises ZeroDivisionError.
    """
    change = finish - start
    return change / start * 100
print(percent_gain(36585.06, 33147.25))
print(percent_gain(4796.56, 3839.50))
print(percent_gain(15832.80, 10466.48))
def get_rating(movie_tuple):
    """Return the rating stored at index 1 of movie_tuple.

    Gives the index access a readable name.
    """
    rating_index = 1
    return movie_tuple[rating_index]
get_rating(('Portrait of a Lady on Fire', 5))
def with_tax(price, tax):
    """Return price with tax added, rounded to 2 decimal places.

    tax is a percentage: it is multiplied by .01 before use, so 8.6
    means 8.6%.
    """
    multiplier = 1 + tax * .01
    return round(price * multiplier, 2)
with_tax(1,8.6)
from datetime import date
def greet_user():
    """Print a greeting, then a line with today's date.

    Has no return statement, so calling it evaluates to None.
    """
    print("Hello, user!")
    today = date.today()
    print("Today's date is " + str(today))
greet_user()
def greet_user():
    """Print a greeting, then a line with today's date.

    Ends with an explicit return that carries no value, so the caller
    receives None.
    """
    print("Hello, user!")
    today = date.today()
    print("Today's date is " + str(today))
    return None
print(greet_user())
def longest_customer_name(list_of_names):
    """Return (name, length) for the longest name in list_of_names.

    When several names share the greatest length the first one wins.
    An empty list gives ("", 0).
    """
    best = ("", 0)
    for candidate in list_of_names:
        size = len(candidate)
        if size > best[1]:  # strictly longer, so earlier ties are kept
            best = (candidate, size)
    return best
name, length = longest_customer_name(['Alice', 'Bob', 'Cassia'])
print(name)
print(length)
from statistics import mean
def min_mean_max(L):
    """Return the tuple (minimum, mean, maximum) of the values in L."""
    smallest = min(L)
    average = mean(L)
    largest = max(L)
    return smallest, average, largest
min_mean_max([1,2,3,4,5])
def count_items(lst):
    """Return the number of items in lst.

    For an empty list a warning is printed and 0 is returned; otherwise
    a message is printed before the length is returned.
    """
    size = len(lst)
    if size != 0:
        print("We don't get here with an empty list")
        return size
    print('Warning: empty list passed to count_items!')
    return 0
count_items([])
def is_prime(n):
    """Return True if the integer n is a prime number, otherwise False.

    Integers below 2 (0, 1 and all negatives) are not prime, so they
    return False.
    """
    if n < 2:
        return False
    i = 2
    # If n has a divisor, it has one no larger than sqrt(n), so the
    # search can stop once i * i exceeds n.
    while i * i <= n:
        if n % i == 0:    # i divides n evenly, no remainder
            return False
        i += 1
    return True           # didn't find a divisor
print(is_prime(11))
print(is_prime(4))
def longest_customer_name(list_of_names):
    """Find the longest name in list_of_names.

    Returns the pair (longest_name, longest_len).  The earliest name is
    kept when lengths tie, and ("", 0) is returned for an empty list.
    """
    longest_name, longest_len = "", 0
    for candidate in list_of_names:
        candidate_len = len(candidate)
        if candidate_len <= longest_len:
            continue  # not strictly longer, keep what we have
        longest_name, longest_len = candidate, candidate_len
    return longest_name, longest_len
def count_matches(to_match, my_list):
    """Count the items in my_list that are equal to to_match."""
    matches = [item for item in my_list if to_match == item]
    return len(matches)
def count_longest_name(list_of_names):
    """Return how many times the longest name occurs in list_of_names.

    Combines longest_customer_name (to pick the name) with
    count_matches (to count its occurrences).
    """
    # Only the name is needed here; the length at index 1 is ignored.
    longest_name = longest_customer_name(list_of_names)[0]
    return count_matches(longest_name, list_of_names)
count_longest_name(['Alice','Bob','Catherine','Catherine'])
def all_names_short_enough1(names, limit):
    """Return True when no name in names is longer than limit.

    Stops at the first name that is too long.  An empty names list
    gives True.
    """
    return not any(len(name) > limit for name in names)
print(all_names_short_enough1(['Alice', 'Bob'], 3))
print(all_names_short_enough1(['Alice', 'Bob'], 5))
def all_names_short_enough2(names, limit):
    """Return True when the longest name in names fits within limit.

    Relies on longest_customer_name for the length of the longest name.
    """
    longest_len = longest_customer_name(names)[1]
    return longest_len <= limit
print(all_names_short_enough2(['Alice', 'Bob'], 3))
print(all_names_short_enough2(['Alice', 'Bob'], 5))
def add5(arg):
    """Return arg plus 5."""
    return arg + 5
add5(7) # Return 12
def pattern_a(price, tax):
    """Return price increased by tax percent.

    Everything the calculation needs arrives through the arguments.
    """
    tax_multiplier = 1 + 0.01 * tax
    return price * tax_multiplier
tax = 20  # Global variable read by pattern_b - this is worse style
def pattern_b(price):
    """Return price increased by the percentage held in the global `tax`.

    Works, but is less flexible and harder to debug than passing the
    rate as an argument: the result depends on a value defined outside
    the function.
    """
    tax_multiplier = 1 + 0.01 * tax  # "tax" is looked up at module level
    return price * tax_multiplier
print(pattern_a(100,20))
print(pattern_b(100))
def add_two(my_number):
    """Return my_number plus two, printing the local value of "a" first."""
    # This "a" is local to the function: assigning it does not touch
    # any variable named "a" that exists outside add_two.
    a = my_number + 2
    message = "a is " + str(a) + " inside add_two"
    print(message)
    return a
a = 5
print("add_two(2) is " + str(add_two(2)))
print("a is " + str(a) + " outside add_two")
my_list = ['a','b','c']
def concatenate_all(my_list):
    """Return all the strings in my_list joined into one string.

    my_list: the strings to join, in order.  The parameter is local to
    the function, so it is unrelated to any outer variable that is also
    called my_list.
    Returns '' for an empty list.
    """
    # str.join builds the result in one pass instead of creating a new
    # intermediate string for every item.
    return ''.join(my_list)
print(concatenate_all(['d','e'])) # ['d','e'] is called my_list in the function
print(concatenate_all(my_list))  # my_list is still a,b,c
# Standardize two lists the repetitive way: the same capitalize-and-sort
# steps are written out once for names and once more for jobs.
names = ["Catherine", "Donovan", "alice", "BOB"]
standardized_names = []
for name in names:
    name = name.capitalize() # Capitalize first letter, lc others
    standardized_names.append(name)
standardized_names.sort()  # Sort once, after every name has been added
jobs = ['Pilot', 'teacheR', 'firefighter', 'LIBRARIAN']
standardized_jobs = []
for job in jobs:
    job = job.capitalize()
    standardized_jobs.append(job)
standardized_jobs.sort()   # Sort once, after every job has been added
print(standardized_names)
print(standardized_jobs)
names = ["Catherine", "Donovan", "alice", "BOB"]
jobs = ['Pilot', 'teacheR', 'firefighter', 'LIBRARIAN']
def standardize_strings(string_list):
    """Return a new sorted list of the strings in string_list, capitalized.

    Each string gets an upper-case first letter and lower-case remainder.
    string_list itself is left unchanged.
    """
    return sorted(s.capitalize() for s in string_list)
standard_names = standardize_strings(names)
standard_jobs = standardize_strings(jobs)
print(standard_names)
print(standard_jobs)
def get_first_letter(word):
  """ Returns the first character of a string.
  word (str):  The string whose first character is wanted.
  A demo-only function: calling get_first_letter(word) takes more
  typing than writing word[0] directly.
  """
  first_letter = word[0]
  return first_letter
get_ipython().run_line_magic('pinfo', 'get_first_letter')
print(get_first_letter("Shibboleth") == "S")
print(pattern_a(100,20) == 120)
print(pattern_a(0, 20) == 0)
print(count_matches("A",[]) == 0)
print(count_matches("A", ["A","A","A"]) == 3)
# Lecture10Hashes.py
my_menu_dict = {
    "Salmon": 25,
    "Steak": 30,
    "Mac and cheese" : 18
}
print(my_menu_dict["Salmon"])
my_menu_dict = {} # empty dictionary
my_menu_dict["Salmon"] = 25
my_menu_dict["Steak"] = 30
my_menu_dict["Mac and cheese"] = 18
print(my_menu_dict["Salmon"])
my_dict = {}
my_dict.get('sushi', 0)
two_cities = """It was the best of times, it was the worst of times,
 it was the age of wisdom, it was the age of foolishness, it was the epoch of belief,
 it was the epoch of incredulity, it was the season of light, it was the season of darkness,
 it was the spring of hope, it was the winter of despair."""
# Count how often each whitespace-separated word occurs in the text.
worddict = {}
wordlist = two_cities.split()
for word in wordlist:
    # get() supplies 0 the first time a word is seen
    worddict[word] = worddict.get(word, 0) + 1
print(worddict["age"])
print(worddict["of"])
# Show every word with its count
for word, count in worddict.items():
    print(word + ":" + str(count))
def word_prob(word, worddict):
    """Return the fraction of all counted words that are `word`.

    word (str):       the word to look up.
    worddict (dict):  maps each word to the number of times it occurred.
    Returns: the count for `word` divided by the total of all counts.
             A word that is not in worddict gives 0, and an empty
             worddict gives 0.0 instead of dividing by zero.
    """
    total = sum(worddict.values())
    if total == 0:
        return 0.0
    return worddict.get(word, 0) / total
print(word_prob('winter', worddict))  # Should be 1/60 = 0.0167 or so
print(word_prob('season', worddict))  # Should be 2/60 = 0.0333 or so
print(word_prob('Pokemon', worddict))  # Should be 0 with no errors
bigIPs = {"209.85.231.104", "207.46.170.123", "72.30.2.43"}
bigIPs.add("208.80.152.2")
len(bigIPs)
newset = set()
newset.add("First item")
print("First item" in newset)
myset = set(range(123456789))   # {0, 1, 2, ...}
mylist = list(range(123456789)) # [0, 1, 2, ...]
12345678 in myset  # Fast, uses hash
12345678 in mylist # Slower, check each item
two_cities_extended = """It was the best of times,
it was the worst of times, it was the age of wisdom,
it was the age of foolishness, it was the epoch of belief,
it was the epoch of incredulity, it was the season of Light,
it was the season of Darkness, it was the spring of hope,
it was the winter of despair, we had everything before us,
we had nothing before us, we were all going direct to Heaven,
we were all going direct the other way--in short, the period was
so far like the present period that some of its noisiest authorities
insisted on its being received, for good or for evil, in the superlative
degree of comparison only.
There were a king with a large jaw and a queen with a plain face,
on the throne of England; there were a king with a large jaw and a
queen with a fair face, on the throne of France. In both countries
it was clearer than crystal to the lords of the State preserves of
loaves and fishes, that things in general were settled for ever.
It was the year of Our Lord one thousand seven hundred and seventy-five.
Spiritual revelations were conceded to England at that favoured period,
as at this. Mrs. Southcott had recently attained her five-and-twentieth
blessed birthday, of whom a prophetic private in the Life Guards had heralded
the sublime appearance by announcing that arrangements were made for the
swallowing up of London and Westminster. Even the Cock-lane ghost had been
laid only a round dozen of years, after rapping out its messages, as the
spirits of this very year last past (supernaturally deficient in originality)
rapped out theirs. Mere messages in the earthly order of events had lately
come to the English Crown and People, from a congress of British subjects
in America: which, strange to relate, have proved more important to the human
race than any communications yet received through any of the chickens of the
Cock-lane brood. 
"""
wordlist = two_cities_extended.split()
def find_by_list(wordlist):
  """ Looks up every word of wordlist in wordlist itself.
  wordlist (list):  the words to look up, and also the list searched.
  Nothing is collected or returned (the result is None); the function
  only performs the list membership tests, so it can be timed.
  """
  for word in wordlist:
    # "in" on a list checks the items one after another
    if word in wordlist:
        continue # Move on to the next iteration of the for loop
get_ipython().run_line_magic('time', 'find_by_list(wordlist)')
worddict = {}
for word in wordlist:
  if word in worddict:
    worddict[word] += 1
  else:
    worddict[word] = 1
def find_by_dict(wordlist, dict):
  """ Looks up every word of wordlist among the keys of a dictionary.
  wordlist (list):  the words to look up.
  dict (dict):      the dictionary whose keys are searched.  Note that
                    inside this function the name hides the built-in
                    dict type.
  Nothing is collected or returned (the result is None); the function
  only performs the dictionary membership tests, so it can be timed.
  """
  for word in wordlist:
    # "in" on a dictionary tests its keys
    if word in dict:
      continue # Move on to next iteration of the for loop
get_ipython().run_line_magic('time', 'find_by_dict(wordlist,worddict)')
mydict = {"a":1000}
dict2 = mydict # gets the address, so any changes are permanent to the original
dict2["b"] = 500
print(mydict)
print(dict2)
dict3 = dict2.copy()
dict3["c"] = 40
print(dict2)
print(dict3)
from string import ascii_lowercase
# The set of distinct characters in the text, each lower-cased, so that
# upper- and lower-case forms of a letter count as the same letter.
myset = {character.lower() for character in two_cities_extended}
def checkletters(myset):
    """Report whether every lowercase ASCII letter is a member of myset.

    Prints "Missing: <letter>" for the first absent letter (in
    alphabetical order) and returns False; otherwise prints "All found"
    and returns True.
    """
    first_missing = next((c for c in ascii_lowercase if c not in myset), None)
    if first_missing is not None:
        print("Missing: " + first_missing)
        return False
    print("All found")
    return True
checkletters(myset)
# Lecture11and12NumpyMatplotlib.py
import numpy as np
v = np.array([1, 2 ,3])
print(v)
A = np.array([[1, 0, 0],
              [0 ,2, 0],
              [0, 0, 3]])  # 3x3 with 1,2,3 along the diagonal
print(A)
print(A.shape)  # Tuples: like lists, but use () instead of [] 
print(v.shape)  # 1d outputs a comma to indicate it's still a tuple
v1 = v
print(v1)
v2 = np.array([4, 5, 6])
print(v2)
print("Adding 1D arrays: ",  v1 + v2)
print("Subtracting 1D arrays: ",  v1 - v2)
print("Multiplying 1D arrays: ", v1 * v2)
print("Dividing 1D arrays: ", v1 / v2)
print(v1)
print("Adding by a constant: ", v1 + 2)
print("Subtracting by a constant: ", v1 - 2)
print("Multiplying by a constant: ", v1 * 2)
print("Dividing by a constant: ", v1 / 2)
my_array = np.array([[1,2,3], 
                     [4,5,6]])
print(np.min(my_array, axis=0))
print(np.mean(my_array, axis=1))
B = np.array([[3, 2], 
              [4, -1]])
w = np.array([1, -1])
z = B @ w
print(z)
my_array = np.array([8, 6, 7, 5, 3, 0, 9])
print(my_array[1:3]) # prints index 1 and 2, not 3
print(my_array)
print(my_array[1:])
my_array[:3]
my_matrix = np.array([[42.3, 71.1, 92],
                      [40.7, 70.0, 85],
                      [47.6, 122.0, 82]])
print(my_matrix)
two_by_two_square = my_matrix[1:, :2]
print(two_by_two_square)
no_last_column = my_matrix[:, :2] # no temperature
print(no_last_column)
import numpy as np
a = np.array([0, 1, 2, 3, 4, 5])
print(a)
b = a[1:3]
print(b)
b[1] = 100 # modify the slice...
print(a) # ...and see the original change
print(np.zeros(3)) #create an array of zeros with length 3
print(np.zeros((2, 3))) # create a 2x3 matrix of zeros
import matplotlib.pyplot as plt
x = [1, 2, 3]
y = [1, 4, 9]
plt.plot(x, y)
plt.show()
import numpy as np
my_points = np.array([[2, 1], 
                      [3, 4], 
                      [5, 6]]) # Each list is a point
print(my_points)
plt.plot(my_points[:, 0], my_points[:,1])    # Slice to get x values separate from y values
plt.show()
plt.plot(my_points[:, 0], my_points[:, 1], 'ro') # 'r' is for red, 'o' is for circles
plt.show()
distances_millions_miles = [35, 67, 93, 142, 484, 889, 1790, 2880]
plt.plot(np.arange(1, 9), distances_millions_miles, 'o')
plt.show()
np.arange(1,9)
xpoints = np.linspace(0, 10, 100)
ypoints = xpoints ** 2 + 1
plt.plot(xpoints, ypoints)
plt.show()
plt.plot(my_points[:, 0], my_points[:, 1], 'ro')
myfit_x = np.linspace(1, 5, 100)
myfit_y = np.linspace(1.5, 5.5, 100) # Same y/x slope for all segments - so, a line
plt.plot(myfit_x,myfit_y)
plt.show()
import matplotlib.pyplot as plt
x = [1, 2, 3]
y1 = [1, 2, 3]
y2 = [3, 2, 1]
plt.plot(x, y1, label='Sales')
plt.plot(x, y2, label='Quality')
plt.legend()
plt.title('Trends')
plt.grid(True)
customers = ['Oliver', 'Sophia', 'Liam', 'Arielle', 'Noah']
total_purchases = [56, 73, 24, 48, 88]
plt.bar(customers, total_purchases)
plt.xlabel("Customer name", fontsize=14)
plt.ylabel("Total purchases", fontsize=14)
plt.title("Total purchases for 5 Amazon customers", fontsize=16)
plt.tick_params(axis='x', labelsize=14)
plt.tick_params(axis='y', labelsize=14)
plt.show()
# Lecture13BiggerPrograms.py
    """
    Compute f-measure for each item in a list.
    
    Argument: stats_list (list):  a list of tuples of four ints, (tp, fp, tn, fn)
               (these stand for true positive, false positive, etc)
    Returns:  a list of floats, the f-measures.
    """
    """
    Compute the f-measure, a performance measure that ignores true negatives.
    
    Arguments:  tp (int):  the count of true positives
                fp (int):  the count of false positives
                tn (int):  the count of true negatives
                fn (int):  the count of false negatives
    Returns: a float, the f-measure.
    """
def f_measures(stats_list):
    """
    Compute f-measure for each item in a list.
    
    Argument: stats_list (list):  a list of tuples of four ints, (tp, fp, tn, fn)
               (these stand for true positive, false positive, etc)
    Returns:  a list of floats, the f-measures.
    """
    # One f-measure per tuple, in the same order as stats_list;
    # an empty stats_list gives an empty list.
    return [f_measure(tp, fp, tn, fn) for tp, fp, tn, fn in stats_list]
def f_measures(stats_list):
    """
    Compute f-measure for each item in a list.
    
    Argument: stats_list (list):  a list of tuples of four ints, (tp, fp, tn, fn)
               (these stand for true positive, false positive, etc)
    Returns:  a list of floats, the f-measures.
    """
    out = []
    for tp, fp, tn, fn in stats_list:
        f = f_measure(tp, fp, tn, fn)
        out.append(f)
    # Return the whole list, not just the last f computed; this also
    # gives [] (rather than an error) for an empty stats_list.
    return out
def f_measure(tp, fp, tn, fn):
    """
    Compute the f-measure, a performance measure that ignores true negatives.
    
    Arguments:  tp (int):  the count of true positives
                fp (int):  the count of false positives
                tn (int):  the count of true negatives (accepted but not used)
                fn (int):  the count of false negatives
    Returns: a float, the f-measure.  A zero denominator is treated as a
             score of 0.0: precision is 0.0 when tp + fp is 0, recall is
             0.0 when tp + fn is 0, and the f-measure is 0.0 when both
             precision and recall are 0.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0
    if precision + recall == 0:
        return 0.0
    return (2 * precision * recall)/(precision + recall)
def f_measure(precision, recall):
    """
    Compute the f-measure, a performance measure that ignores true negatives.
    
    Arguments:  precision (float):  proportion of positive classifications that are correct
                recall (float):  proportion of positive examples that were found
    Returns: a float, the f-measure; 0.0 when precision + recall is 0,
             which would otherwise be a division by zero.
    """
    total = precision + recall
    if total == 0:
        return 0.0
    return (2 * precision * recall)/total
def precision(tp, fp):
    """
    Compute precision from true-positive and false-positive counts.

    Arguments:  tp (int):  the count of true positives
                fp (int):  the count of false positives
    Returns: tp/(tp + fp), or 0 when both counts are 0.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:  # 0/0 is undefined; report 0 by convention
        return 0
    return tp/predicted_positive
def recall(tp, fn):
    """
    Compute recall from true-positive and false-negative counts.

    Arguments:  tp (int):  the count of true positives
                fn (int):  the count of false negatives
    Returns: tp/(tp + fn), or 0 when both counts are 0.
    """
    actual_positive = tp + fn
    if actual_positive == 0:  # 0/0 is undefined; report 0 by convention
        return 0
    # The value must be returned; a bare expression would be discarded
    # and the caller would receive None.
    return tp/actual_positive
    
def f_measures(stats_list):
    """
    Compute f-measure for each item in a list.

    Argument: stats_list (list):  a list of tuples of four ints, (tp, fp, tn, fn)
               (these stand for true positive, false positive, etc)
    Returns:  a list of floats, the f-measures (empty if stats_list is empty).
    """
    out = []
    for tp, fp, tn, fn in stats_list:
        # tn is unpacked but unused: the f-measure ignores true negatives
        out.append(f_measure(precision(tp, fp), recall(tp, fn)))
    # Return the accumulated list, not only the last f-measure.
    return out
# Spot-check each helper on one simple input; the value we hope to see
# is noted next to each call.
print(precision(4,4)) # Expect 0.5
print(recall(4,4)) # Expect 0.5
print(f_measure(1, 1)) # Expect 1
def recall(tp, fn):
    """
    Compute recall from true-positive and false-negative counts.

    Arguments:  tp (int):  the count of true positives
                fn (int):  the count of false negatives
    Returns: tp/(tp + fn), or 0 when both counts are 0.
    Also prints the value (kept from the original tracing version).
    """
    actual_positive = tp + fn
    value = 0 if actual_positive == 0 else tp/actual_positive
    print(value)
    # Printing shows the number but hands nothing back; return it as well
    # so callers can use the result.
    return value
    
# Call recall directly, without wrapping the call in print().
recall(4,4)
def recall(tp, fn):
    """
    Compute recall from true-positive and false-negative counts.

    Arguments:  tp (int):  the count of true positives
                fn (int):  the count of false negatives
    Returns: tp/(tp + fn), or 0 when both counts are 0.
    Also prints the value before returning it.
    """
    actual_positive = tp + fn
    # Compute once (guarding the 0/0 case), then both print and return it.
    value = 0 if actual_positive == 0 else tp/actual_positive
    print(value)
    return value
    
# Call recall directly, without wrapping the call in print().
recall(4,4)
def f_measure(precision, recall):
    """
    Compute the f-measure, a performance measure that ignores true negatives.

    Arguments:  precision (float):  proportion of positive classifications that are correct
                recall (float):  proportion of positive examples that were found
    Returns: a float, the f-measure; 0 when precision and recall are both 0.
    """
    denominator = precision + recall
    # precision == recall == 0 would make the formula 0/0; report 0 instead.
    if denominator == 0:
        return 0
    return (2 * precision * recall)/denominator
def precision(tp, fp):
    """
    Compute precision from true-positive and false-positive counts.

    Arguments:  tp (int):  the count of true positives
                fp (int):  the count of false positives
    Returns: tp/(tp + fp), or 0 when both counts are 0.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:  # 0/0 is undefined; report 0 by convention
        return 0
    return tp/predicted_positive
def recall(tp, fn):
    """
    Compute recall from true-positive and false-negative counts.

    Arguments:  tp (int):  the count of true positives
                fn (int):  the count of false negatives
    Returns: tp/(tp + fn), or 0 when both counts are 0.
    """
    actual_positive = tp + fn
    if actual_positive == 0:  # 0/0 is undefined; report 0 by convention
        return 0
    return tp/actual_positive
    
def f_measures(stats_list):
    """
    Compute f-measure for each item in a list.

    Argument: stats_list (list):  a list of tuples of four ints, (tp, fp, tn, fn)
               (these stand for true positive, false positive, etc)
    Returns:  a list of floats, the f-measures (empty if stats_list is empty).
    """
    # One f-measure per input tuple; tn is unpacked only to match the tuple
    # shape, since the f-measure ignores true negatives.
    return [f_measure(precision(tp, fp), recall(tp, fn))
            for tp, fp, tn, fn in stats_list]
# Typical inputs first ...
print(precision(4,4)) # Expect 0.5
print(recall(4,4)) # Expect 0.5
print(f_measure(1, 1)) # Expect 1
# ... then boundary inputs where one or both counts are zero.  With both
# counts zero the ratio is 0/0, so the desired answer is a convention.
print(precision(0, 4)) # Expect 0
print(precision(0, 0)) # Expect ... oh, I guess we didn't think about this.  0?
print(precision(4, 0)) # Expect 1
print(recall(0, 4)) # Expect 0
print(recall(0, 0)) # Similarly to precision, let's return 0
print(recall(4, 0)) # Expect 1
print(f_measure(0, 0)) # Expect 0
print(f_measure(0.5, 0.5)) # Expect 0.5
def f_measure(precision, recall):
    """
    Compute the f-measure, a performance measure that ignores true negatives.

    Arguments:  precision (float):  proportion of positive classifications that are correct
                recall (float):  proportion of positive examples that were found
    Returns: a float, the f-measure; 0 when precision and recall are both 0.
    """
    denominator = precision + recall
    # precision == recall == 0 would make the formula 0/0; report 0 instead.
    if denominator == 0:
        return 0
    return (2 * precision * recall)/denominator
def precision(tp, fp):
    """
    Compute precision from true-positive and false-positive counts.

    Arguments:  tp (int):  the count of true positives
                fp (int):  the count of false positives
    Returns: tp/(tp + fp), or 0 when both counts are 0.
    """
    denominator = tp + fp
    return 0 if denominator == 0 else tp/denominator
def recall(tp, fn):
    """
    Compute recall from true-positive and false-negative counts.

    Arguments:  tp (int):  the count of true positives
                fn (int):  the count of false negatives
    Returns: tp/(tp + fn), or 0 when both counts are 0.
    """
    denominator = tp + fn
    return 0 if denominator == 0 else tp/denominator
    
def f_measures(stats_list):
    """
    Compute f-measure for each item in a list.

    Argument: stats_list (list):  a list of tuples of four ints, (tp, fp, tn, fn)
               (these stand for true positive, false positive, etc)
    Returns:  a list of floats, the f-measures (empty if stats_list is empty).
    """
    out = []
    for tp, fp, tn, fn in stats_list:
        # tn is unpacked but unused: the f-measure ignores true negatives
        out.append(f_measure(precision(tp, fp), recall(tp, fn)))
    # Return the accumulated list, not only the last f-measure.
    return out
# Typical inputs first ...
print(precision(4,4)) # Expect 0.5
print(recall(4,4)) # Expect 0.5
print(f_measure(1, 1)) # Expect 1
# ... then boundary inputs where one or both counts are zero.
print(precision(0, 4)) # Expect 0
print(precision(0, 0)) # Expect 0
print(precision(4, 0)) # Expect 1
print(recall(0, 4)) # Expect 0
print(recall(0, 0)) # Similarly to precision, let's return 0
print(recall(4, 0)) # Expect 1
print(f_measure(0, 0)) # Expect 0
print(f_measure(0.5, 0.5)) # Expect 0.5
def f_measure(precision, recall):
    """
    Combine precision and recall into the f-measure.

    Arguments:  precision (float):  proportion of positive classifications that are correct
                recall (float):  proportion of positive examples that were found
    Returns: 2 * precision * recall / (precision + recall),
             or 0 when precision + recall is 0.
    """
    total = precision + recall
    numerator = 2 * precision * recall
    return 0 if total == 0 else numerator/total
def precision(tp, fp):
    """
    Compute precision from true-positive and false-positive counts.

    Arguments:  tp (int):  the count of true positives
                fp (int):  the count of false positives
    Returns: tp/(tp + fp), or 0 when both counts are 0.
    """
    denominator = tp + fp
    return 0 if denominator == 0 else tp/denominator
def recall(tp, fn):
    """
    Compute recall from true-positive and false-negative counts.

    Arguments:  tp (int):  the count of true positives
                fn (int):  the count of false negatives
    Returns: tp/(tp + fn), or 0 when both counts are 0.
    """
    denominator = tp + fn
    return 0 if denominator == 0 else tp/denominator
    
def f_measures(stats_list):
    """
    Compute f-measure for each item in a list.

    Argument: stats_list (list):  a list of tuples of four ints, (tp, fp, tn, fn)
               (these stand for true positive, false positive, etc)
    Returns:  a list of floats, the f-measures (empty if stats_list is empty).
    """
    # One f-measure per input tuple; tn is unpacked only to match the tuple
    # shape, since the f-measure ignores true negatives.
    return [f_measure(precision(tp, fp), recall(tp, fn))
            for tp, fp, tn, fn in stats_list]
# Re-run the same checks: typical inputs first ...
print(precision(4,4)) # Expect 0.5
print(recall(4,4)) # Expect 0.5
print(f_measure(1, 1)) # Expect 1
# ... then boundary inputs where one or both counts are zero.
print(precision(0, 4)) # Expect 0
print(precision(0, 0)) # Expect 0
print(precision(4, 0)) # Expect 1
print(recall(0, 4)) # Expect 0
print(recall(0, 0)) # Similarly to precision, let's return 0
print(recall(4, 0)) # Expect 1
print(f_measure(0, 0)) # Expect 0
print(f_measure(0.5, 0.5)) # Expect 0.5
# Lecture14Pandas.py
import pandas as pd
import numpy as np
# A Series built from a plain list, with no explicit index given.
s1 = pd.Series([-3, -1, 1, 3, 5])
print(s1)
print(s1.index)
s1[:2] # First 2 elements
print(s1[[2,1,0]])  # Elements out of order
type(s1)
s1[s1 > 0]  # Select with a boolean mask built by the comparison
# A Series of 5 random numbers with explicit string labels as its index.
s2 = pd.Series(np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])
print(s2)
print(s2.index)
print(s2['a'])
# A Series built from a dictionary.
data = {'pi': 3.14159, 'e': 2.71828}  # dictionary
print(data)
s3 = pd.Series(data)
print(s3)
my_array = s3.values
print(my_array)
import numpy as np
# A DataFrame built from a 2x3 array plus row (index) and column labels.
my_data = np.array([[5, 5, 4], 
                    [2, 3, 4]])
hotels = pd.DataFrame(my_data, index = ["Alice rating", "Bob rating"],
                   columns = ["Hilton", "Marriott", "Four Seasons"])
hotels
# NOTE: google.colab and get_ipython() are presumably only available when
# this runs inside a Colab/IPython notebook, not as a plain script.
from google.colab import files
uploaded = files.upload() # pick starbucks_drinkMenu_expanded.csv
get_ipython().system('ls')
import pandas as pd
# Use the 'Beverage' column as the row index.
df = pd.read_csv('starbucks_drinkMenu_expanded.csv', index_col = 'Beverage')
df.head()
# Sort rows by the Calories column, largest first.
sorted_df = df.sort_values(by = "Calories", ascending=False)
sorted_df.head()
hotels
hotels['Hilton']
# Average one column by hand.  The accumulator is deliberately not named
# `sum`: a module-level variable with that name would shadow the built-in
# sum() for all code that runs afterwards.
rating_total = 0
for rating in hotels['Hilton']:
    rating_total += rating
print('Average Hilton Rating: ' + str(rating_total/len(hotels['Hilton'])))
# .loc selects by label, .iloc by integer position.
hotels.loc['Bob rating']
hotels.loc['Bob rating', 'Marriott']
hotels.iloc[1, 1]
print(hotels.iloc[0, 1:2])
print(hotels.loc['Bob rating', ['Marriott', 'Hilton']])
# The comparison gives one True/False per row; indexing df with it keeps
# the rows where it is True.
(df['Calories'] > 300)
df[df['Calories'] > 300].head()
# Combine two row conditions with & (each side parenthesized).
df[(df['Calories'] > 300) & (df['Beverage_prep'] == 'Soymilk')].head()
# New column computed from two existing columns.
df['bad_fat'] = df['Trans_Fat_g'] + df['Saturated_Fat_g']
df.head()
# New column built row by row: look each Beverage_prep value up in the
# dictionary, using -1 for values that are not size names.
size_ounces_dict = {'Short': 8, 'Tall': 12, 'Grande': 16, 'Venti': 20}
ounces_list = []
for drink in df['Beverage_prep']:
    ounces_list.append(size_ounces_dict.get(drink, -1))
df['ounces'] = ounces_list
df.head()
def size_to_ml(size_name):
    """
    Convert a drink size name to a volume in milliliters.

    Argument: size_name:  a size name such as 'Short', 'Tall', 'Grande' or 'Venti'
    Returns:  the size in ounces times 29.5735; a name that is not in the
              table counts as 0 ounces.
    """
    ounces_by_size = {'Short': 8, 'Tall': 12, 'Grande': 16, 'Venti': 20}
    if size_name in ounces_by_size:
        ounces = ounces_by_size[size_name]
    else:
        ounces = 0
    return ounces * 29.5735
# Apply size_to_ml to every value of the Beverage_prep column.
ml = df['Beverage_prep'].map(size_to_ml)
print(ml)
# Lecture15Pandas.py
import pandas as pd
# Load the drinks table, indexed by the 'Beverage' column.
df = pd.read_csv('starbucks_drinkMenu_expanded.csv', index_col = 'Beverage')
df.head()
# Summary statistics for a single column.
print(df.loc[:, "Protein_g"].mean())
print(df.loc[:, "Protein_g"].max())
print(df.loc[:, "Protein_g"].idxmax()) # "argmax," gives index with biggest value
df.describe()
df.corr(numeric_only=True)  # New to pandas 2.0.0: chokes on strings without added arg
df.columns
df.dtypes
# Slicing off the last character, first on a plain string ...
string = 'string'
string[:-1]
# ... then on every value of a column through the .str accessor.
df['Vitamin_A'] = df['Vitamin_A'].str[0:-1] # Remove the % at the end
df['Vitamin_A']
# Convert the remaining text to numbers.
df['Vitamin_A'] = pd.to_numeric(df['Vitamin_A'])
df.dtypes
df['Vitamin_A'] = df['Vitamin_A'].astype('float64')
df.dtypes
df.corr(numeric_only=True)
# Count missing values per column, drop rows that have any, count again.
df.isnull().sum()
df = df.dropna(axis=0, how="any") # Remove the offending row
df.isnull().sum()
# Find the index label of the row with the most calories by looping over
# the rows.  Starting from 0 with a strict > means best_name stays ""
# if no row has more than 0 calories.
calorie_max = 0
best_name = ""
for index, row in df.iterrows():
  if row['Calories'] > calorie_max:
    calorie_max = row['Calories']
    best_name = index
print(best_name)
protein = df.loc[:, "Protein_g"]
protein.hist(bins=20); # Create a histogram with 20 equally spaced bins for the data
subplot = df[["Protein_g", "Vitamin_A"]] # Notice another way to get desired columns
subplot.boxplot(); # Boxplots give median value, middle 50% of data, and range of non-outliers
# NOTE: google.colab is presumably only importable inside a Colab notebook.
from google.colab import files
uploaded = files.upload() # pick titanic.csv
# Load the passenger table, indexed by the 'PassengerId' column.
df = pd.read_csv('titanic.csv', index_col = 'PassengerId')
df.head()
df.columns
df.dtypes
df.describe()
df.corr(numeric_only=True)
# Split the rows by the value of the 'Sex' column and summarize each part.
males = df[df['Sex'] == 'male']
males.head()
males.describe()
females = df[df['Sex'] == 'female']
females.describe()
# Add a True/False column (True where Sex is 'female') and recompute
# the correlations with it present.
df['sex_numeric'] = df['Sex'] == 'female'
df.corr(numeric_only=True)
# One histogram of 'Survived' per passenger class.
third_class = df[df['Pclass'] == 3]
second_class = df[df['Pclass'] == 2]
first_class = df[df['Pclass'] == 1]
third_class['Survived'].hist();
second_class['Survived'].hist();
first_class['Survived'].hist();
# Lecture16Strings.py
# f-strings: {name} inserts a value, and :.2f formats it with 2 decimals.
my_cost = 12.95821
print(f'The total cost was {my_cost} dollars')
print(f'The total cost was {my_cost:.2f} dollars')
# split breaks a string into a list at the separator; join does the reverse.
groceries = "milk,eggs,yogurt"
grocerieslist = groceries.split(',')
print(grocerieslist)
','.join(['milk', 'eggs', 'yogurt'])
# strip removes whitespace from both ends.
'     milk,eggs,yogurt     '.strip()
lines = "SERVANT: Sir, there are ten thousand--\nMACBETH: Geese, villain?"
linelist = lines.splitlines()  # A shortcut for split('\n')
# Print only the spoken text (the part after ": ") of MACBETH's lines.
for line in linelist:
  if line.startswith("MACBETH"):
    print(line.split(": ")[1])
# \n is a newline and \t a tab inside a string literal.
print('Wow\n\twow!')
print("foo" in "food")
print("foodfood".replace("foo", "ra"))
import numpy as np
import pandas as pd
# Ratings with stray leading/trailing whitespace, to be cleaned up below.
my_data = np.array([["Excellent", "   Okay   ", "   Okay"], ["Great    ", "   Good", "   Good"]])
df = pd.DataFrame(my_data, columns = ["Hilton", "Marriott", "Four Seasons"], index = ["Alice", "Bob"])
df
marriott = df['Marriott']
for s in marriott:
    print(s)
print('---')
# The .str accessor applies a string method to every value of the column.
for s in marriott.str.strip():
    print(s)  # Look, no extra whitespace
# Raw string: \s is not a valid escape in a normal string literal (newer
# Python versions warn about it), so the pattern is written as r"...".
marriott.str.match(r"\s*Okay\s*")
import re
# NOTE: every pattern containing a backslash is written as a raw string
# (r'...').  Sequences like \d or \$ are not valid escapes in a normal
# string literal; they happen to pass through unchanged today, but newer
# Python versions warn about them.  The pattern text itself is unchanged.

# Plain text is a valid pattern: it matches itself.
pattern = '02143'
longstring = 'Somerville, MA 02143'
result = re.search(pattern, longstring)
if result:  # (if it's not None)
    print(result.group())
# re.search returns None when the pattern occurs nowhere in the string.
longstring = '0132428190214200'
pattern2 = '02143'
result2 = re.search(pattern2, longstring)
print(result2)
# \d matches one digit.
pattern3 = r'\d\d\d\d\d'
longstring = 'Somerville, MA 02143'
result3 = re.search(pattern3, longstring)
if result3:
    print(result3.group())
# + means "one or more of the preceding item".
longstring = 'My phone number is 5555555'
pattern4 = r'phone number is \d+'
result4 = re.search(pattern4, longstring)
if result4:
    print(result4.group())
# ? makes the preceding item optional.
longstring = 'Call me at 555-5555'
pattern5 = r'\d\d\d-?\d\d\d\d'
result5 = re.search(pattern5, longstring)
if result5:
    print(result5.group())
# Parentheses group several items so that ? applies to the whole group.
longstring = "Call me at 1-800-555-5555."
pattern = r"(\d-)?(\d\d\d-)?\d\d\d-?\d\d\d\d"
result = re.search(pattern, longstring)
if result:
    print(result.group())
longstring2 = "Call me at 555-5555."
result = re.search(pattern, longstring2)
if result:
    print(result.group())
# | means "either alternative".
pattern = "Somerville, (MA|NJ)"
longstring = "Somerville, NJ 02143"
result = re.search(pattern, longstring)
if result:
    print(result.group())
# findall returns every match instead of only the first.
longstring = "States with a Somerville:  AL, IN, ME, MA, NJ, OH, TN, TX"
pattern = "[A-Z][A-Z]"  # Get capital letters within A-Z range
result = re.findall(pattern, longstring)
print(result)
# Parenthesized parts can be read back individually with group(n).
longstring = "The stock NVDA went down 4.54 points"
pattern = r"stock (\w+) went down (\d+\.\d+) points"
result = re.search(pattern, longstring)
if result:
    print(result.group())
    print(result.group(1))  # Subgroup 1, the first () in the pattern
    print(result.group(2))
import re
# \$ matches a literal dollar sign.
longstring = "We paid $100 for those shoes"
pattern = r'\$\d+'
result = re.search(pattern, longstring)
print(result.group())
# Lecture18Objects.py
class Car:
    """A bare class that defines no attributes or methods of its own."""
# Three separate instances of the same class.
car1 = Car()
car2 = Car()
car3 = Car()
print(isinstance(car1,Car))
# Attributes are created on each instance simply by assigning to them.
car1.year = 2010
car1.make = "Honda"
car1.model = "Fit"
car1.color = "blue"
car2.year = 2013
car2.make = "Toyota"
car2.model = "Camry"
car2.color = "silver"
print(f"This car is a {car1.year} {car1.color} {car1.make} {car1.model}")
# The same data as a tuple: fields are reached by position, not by name.
my_car = (2010, 'Honda', 'Fit', 'blue')
print(f"This car is a {my_car[0]} {my_car[3]} {my_car[1]} {my_car[2]}")
class Car:
    """A car whose year, color, make and model are assigned after construction."""

    def print_facts(self):
        """Print a one-line description built from year, color, make and model."""
        description = f"This car is a {self.year} {self.color} {self.make} {self.model}"
        print(description)
# Create two cars, fill in their attributes by hand, then let each one
# describe itself through the print_facts method.
car1 = Car()
car2 = Car()        
car1.year = 2010
car1.make = "Honda"
car1.model = "Fit"
car1.color = "blue"
car2.year = 2013
car2.make = "Toyota"
car2.model = "Camry"
car2.color = "silver"
car1.print_facts()
car2.print_facts()
class Car:
    """A car described by its year, make, model and color."""

    def __init__(self, year, make, model, color):
        # Each constructor argument is stored in the attribute of the same
        # name; the assignment is still required, it does not happen by itself.
        self.year, self.make = year, make
        self.model, self.color = model, color

    def print_facts(self):
        """Print a one-line description built from year, color, make and model."""
        description = f"This car is a {self.year} {self.color} {self.make} {self.model}"
        print(description)
# With a constructor, all four attributes are set in the creating call.
car1 = Car(2010, "Honda", "Fit", "blue")
car2 = Car(2013, "Toyota", "Camry", "silver")        
car1.print_facts()
car2.print_facts()
def newest_car(list_of_cars):
    """
    Find the car with the largest year.

    Argument: list_of_cars (list):  the cars to compare
    Returns:  the first car that has the largest year, or None for an empty list.
    Prints a warning for each item that is not a Car; such items are not
    compared.
    """
    if not list_of_cars:
        return None
    newest = list_of_cars[0]
    latest_year = newest.year
    for candidate in list_of_cars:
        if isinstance(candidate, Car):
            # Strictly greater: on a tie the earlier car is kept
            if candidate.year > latest_year:
                latest_year = candidate.year
                newest = candidate
        else:
            # Guards against being handed the wrong kind of list
            print('Warning, list had non-car items!')
    return newest
# Pick the car with the larger year and print its description.
newest_car([car1, car2]).print_facts()
class Bill:
    """ Represents a bill at a restaurant.
    _items (list of tuples):  list of (item name, cost) tuples
    """

    def __init__(self, items):
        self._items = items

    def items(self):
        """Getter: return the stored list of (item name, cost) tuples."""
        return self._items

    def set_items(self, items):
        """Setter: replace the stored list of (item name, cost) tuples."""
        self._items = items

    def total_cost_pretax(self):
        """Return the costs of all items added together, starting from 0."""
        running_total = 0
        for entry in self._items:
            _name, price = entry
            running_total = running_total + price
        return running_total

    def total_cost_with_tax(self, tax_rate):
        """Return the pre-tax total times (1 + tax_rate), rounded to 2 decimals."""
        pretax = self.total_cost_pretax()
        return round(pretax * (1 + tax_rate), 2)
# Build a bill from (item name, cost) tuples and add 8% tax.
my_lunch = [("Ham Sandwich", 9), ("Coke", 2)]
new_bill = Bill(my_lunch)
cost_with_tax = new_bill.total_cost_with_tax(0.08)
print(f"Total cost: {cost_with_tax}")
new_bill.items() # could have said new_bill._items, but we were told not to
class Bill:
    """ Represents a bill at a restaurant. 
    _item_names (list of strings):  list of items on bill
    _item_costs (list of ints): list of prices of items on bill
    _items is not here anymore! sorry anybody who wrote code that uses it, we warned you!
    """

    def __init__(self, items):
        """items: an iterable of (item name, cost) tuples."""
        self.set_items(items)

    # "Getter"
    def items(self):
        """Return the bill as a list of (item name, cost) tuples."""
        # list(zip(a, b)) returns a list of tuples combining a and b
        return list(zip(self._item_names, self._item_costs))

    # "Setter"
    def set_items(self, items):
        """Replace the bill's contents with the given (item name, cost) tuples."""
        # Materialize first so a one-shot iterable can be walked twice.
        items = list(items)
        self._item_names = [item[0] for item in items]
        self._item_costs = [item[1] for item in items]

    def total_cost_pretax(self):
        """Return the sum of all item costs, before tax."""
        # Must read _item_costs: this class no longer has an _items attribute.
        return sum(self._item_costs)

    # Notice that we can call another method with this one
    def total_cost_with_tax(self, tax_rate):
        """Return the pre-tax total times (1 + tax_rate), rounded to 2 decimals."""
        return round(self.total_cost_pretax() * (1 + tax_rate), 2)
# Callers that go through the items() getter are unaffected by the
# change in how the class stores its data.
my_lunch = [("Ham Sandwich", 9), ("Coke", 2)]
new_bill = Bill(my_lunch)
print(new_bill.items())  # this still works, but _items would have broken
class Circle:
    """A circle described by its radius."""

    def __init__(self, radius):
        """Store radius; raise ValueError if it is below 0."""
        is_negative = radius < 0
        if is_negative:
            raise ValueError("Can't have negative circle radius")
        self.radius = radius
# A negative radius is rejected: this call raises ValueError.
Circle(-1)
class Circle2:
    """A circle whose radius is 2 unless another value is given."""

    def __init__(self, radius=2):
        """Store radius exactly as passed; no validation is done."""
        self.radius = radius
# No argument given, so the default radius of 2 is used.
Circle2().radius
class Student:
    """A student with an age, a major and a year."""

    def __init__(self, age, major, year):
        self.age, self.major, self.year = age, major, year

    def get_older(self, amount):
        """Increase this student's age by amount (changes the object in place)."""
        self.age += amount
# A method can change the object it is called on.
bob = Student(20,"Biology","Sophomore")
bob.get_older(2)
print(bob.age)
# Assignment does not copy: car1 and car2 are two names for ONE object,
# so a change made through either name is visible through both.
car1 = Car(2010, "Honda", "Fit", "blue")
car2 = car1
car2.color = "black"
car1.print_facts()  # It's black now
car2.print_facts()
# copy.copy creates a second object, so the two can now differ.
import copy
car2 = copy.copy(car1)
car2.color = "white"
car1.print_facts()
car2.print_facts()
# NOTE: google.colab is presumably only importable inside a Colab notebook.
from google.colab import files
uploaded = files.upload() # import books.csv
import pandas as pd
# Load the books table, indexed by the 'title' column.
df = pd.read_csv('books.csv', index_col = 'title')
df.head()
class Book:
    """A book record: title, author and average rating."""

    def __init__(self, title, author, average_rating):
        # Could add more fields from the dataset if desired
        self.title, self.author = title, author
        self.average_rating = average_rating
    
class Publisher:
    """A publisher together with the books from a table that it published."""

    def __init__(self, df, publisher_name):
        """
        Collect the rows of df whose publisher field equals publisher_name.

        Each matching row becomes a Book built from the row's index label,
        its authors field and its average_rating field.
        """
        self.name = publisher_name
        self.books = []
        for row in df.itertuples():
            if row.publisher != publisher_name:
                continue
            book = Book(row.Index, row.authors, row.average_rating)
            self.books.append(book)

    def average_rating(self):
        """Return the mean of average_rating over this publisher's books."""
        rating_sum = 0
        for book in self.books:
            rating_sum = rating_sum + book.average_rating
        return rating_sum/len(self.books)
# Gather the rows whose publisher is 'Scholastic Inc.' and average their ratings.
scholastic = Publisher(df,'Scholastic Inc.')
scholastic.average_rating()
# Lecture19MoreOO.py
class Client:  # both Faculty and Students
    """Data shared by every client: a birth year and a uid."""

    def __init__(self, birthyear, uid):
        self.birthyear, self.uid = birthyear, uid

    def get_uid(self):
        """Return the uid given at construction."""
        return self.uid

    def get_birthyear(self):
        """Return the birth year given at construction."""
        return self.birthyear
class Student(Client):  # inherit from Client
    """A Client that additionally has a graduation year."""

    def __init__(self, birthyear, uid, gradyear):
        # All three attributes are assigned here directly
        self.birthyear, self.uid, self.gradyear = birthyear, uid, gradyear

    def get_gradyear(self):
        """Return the graduation year given at construction."""
        return self.gradyear
    
class Faculty(Client):
    """A Client that adds nothing of its own; everything is inherited."""
   
# A Student can use Client's methods as well as its own.
alice = Student(2003, 123456789, 2024)
print(alice.get_birthyear()) # Inherited from Client
print(alice.get_uid())       # Inherited from Client
print(alice.get_gradyear())  # Specific to Student
# isinstance can act as a permission check: a Student is not a Faculty.
person1 = Student(2000,123456,2025)
if not isinstance(person1, Faculty):
    print("Hey, this person doesn't have permission to do this!")
else:
    print("Welcome, Faculty number " + str(person1.uid) + "!")
# isinstance is True for the object's own class and for every ancestor class.
student1 = Student(2000,123456,2025)
print(isinstance(student1,Student))
print(isinstance(student1,Client))
print(isinstance(student1,object)) # Every class inherits from object
class Student(Client):  # inherit from Client
    """A Client that additionally has a graduation year."""

    def __init__(self, birthyear, uid, gradyear):
        # Let Client's constructor store the attributes it is responsible for
        super().__init__(birthyear, uid)
        self.gradyear = gradyear

    def get_gradyear(self):
        """Return the graduation year given at construction."""
        # `self` is required: without it, calling the method on an instance
        # raises TypeError because the instance is passed as an argument.
        return self.gradyear
# uid is stored by Client's constructor, which Student's constructor calls.
bob = Student(2002,987654321,2022)
print(bob.get_uid()) # inherited from Client
class Trip:
    """
    A travel expense.

    Attributes:
      start_date, end_date:  the trip's dates, stored exactly as passed in
      reimbursed (bool):     False until reimburse() is called
    The cost is kept in the private attribute _cost and read through cost().
    """

    def __init__(self,cost,start_date,end_date):
        # Stored as _cost, not cost: an instance attribute named `cost`
        # would shadow the cost() method and make trip.cost() fail.
        self._cost = cost
        self.start_date = start_date
        self.end_date = end_date
        self.reimbursed = False

    def cost(self):
        """Return the cost given at construction."""
        return self._cost

    def reimburse(self):
        """Mark this trip as reimbursed."""
        self.reimbursed = True

    def dates(self):
        """Return the tuple (start_date, end_date)."""
        # Must use the attribute names that __init__ actually sets.
        return self.start_date, self.end_date
class EquipmentOrder:
    """
    An equipment purchase.

    Attributes:
      reimbursed (bool):  False until reimburse() is called
    The cost and the domestic-seller value are kept in the private attributes
    _cost and _domestic_seller and read through the methods of the same
    (un-prefixed) names.
    """

    def __init__(self,cost,domestic_seller):
        # Underscore-prefixed so the attributes do not shadow the cost() and
        # domestic_seller() methods, which would make those methods uncallable.
        self._cost = cost
        self.reimbursed = False
        self._domestic_seller = domestic_seller

    def cost(self):
        """Return the cost given at construction."""
        return self._cost

    def reimburse(self):
        """Mark this order as reimbursed."""
        self.reimbursed = True

    def domestic_seller(self):
        """Return the domestic_seller value given at construction."""
        return self._domestic_seller
class Expense:
    """
    What every kind of expense has in common: a cost and a reimbursed flag.

    Attributes:
      reimbursed (bool):  False until reimburse() is called
    The cost is kept in the private attribute _cost and read through cost().
    """

    def __init__(self,cost):
        # Stored as _cost, not cost: an instance attribute named `cost`
        # would shadow the cost() method and make expense.cost() fail.
        self._cost = cost
        self.reimbursed = False

    def cost(self):
        """Return the cost given at construction."""
        return self._cost

    def reimburse(self):
        """Mark this expense as reimbursed."""
        self.reimbursed = True
class Trip(Expense):
    """An Expense that also has a start date and an end date."""

    def __init__(self,cost,start_date,end_date):
        # Expense's constructor handles the cost
        super().__init__(cost)
        self.start_date, self.end_date = start_date, end_date

    # inherit cost, reimburse
    def dates(self):
        """Return the tuple (start_date, end_date)."""
        period = (self.start_date, self.end_date)
        return period
class EquipmentOrder(Expense):
    """
    An Expense that also records the domestic_seller value it was given.

    The value is kept in the private attribute _domestic_seller and read
    through the domestic_seller() method.
    """

    def __init__(self,cost,domestic_seller):
        # Expense's constructor handles the cost
        super().__init__(cost)
        # Underscore-prefixed so the attribute does not shadow the
        # domestic_seller() method, which would make it uncallable.
        self._domestic_seller = domestic_seller

    # inherit cost, reimburse
    def domestic_seller(self):
        """Return the domestic_seller value given at construction."""
        return self._domestic_seller
class Employee:
    """An employee with a name, salary, title and years of service."""

    def __init__(self, name, salary, title, years_of_service):
        self.name, self.salary = name, salary
        self.title, self.years_of_service = title, years_of_service

    def give_raise(self, raise_amount):
        """Add raise_amount to the salary."""
        self.salary = self.salary + raise_amount

    def change_title(self, new_title):
        """Replace the title with new_title."""
        self.title = new_title

    def update_years_of_service(self, increase):
        """Add increase to the years of service."""
        self.years_of_service = self.years_of_service + increase
class Contractor:
    """A contractor with a name, salary and contract duration."""

    def __init__(self, name, salary, contract_duration):
        self.name, self.salary = name, salary
        self.contract_duration = contract_duration

    def give_raise(self, raise_amount):
        """Add raise_amount to the salary."""
        self.salary = self.salary + raise_amount
    
# Both classes offer give_raise, each through its own copy of the method.
alice = Employee("Alice", 90000, "Manager", 7)
alice.give_raise(10000)
print(alice.salary)
bob = Contractor("Bob", 80000, 2)
bob.give_raise(10000)
print(bob.salary)
class Worker:
    """Anyone who is paid: has a name and a salary."""

    def __init__(self, name, salary):
        self.name, self.salary = name, salary

    def give_raise(self, raise_amount):
        """Add raise_amount to the salary."""
        self.salary = self.salary + raise_amount
    
class Employee(Worker):
    """A Worker that also has a title and years of service."""

    def __init__(self, name, salary, title, years_of_service):
        # Worker's constructor stores name and salary
        super().__init__(name, salary)
        self.title, self.years_of_service = title, years_of_service

    def change_title(self, new_title):
        """Replace the title with new_title."""
        self.title = new_title

    def update_years_of_service(self, increase):
        """Add increase to the years of service."""
        self.years_of_service = self.years_of_service + increase
class Contractor(Worker):
    """A Worker that also has a contract duration."""

    def __init__(self, name, salary, contract_duration):
        # Set the subclass-specific attribute, and let Worker's constructor
        # store name and salary
        self.contract_duration = contract_duration
        super().__init__(name, salary)
        
# give_raise is now written once, in Worker, and inherited by both subclasses.
alice = Employee("Alice", 90000, "Manager", 7)
alice.give_raise(10000)
print(alice.salary)
bob = Contractor("Bob", 80000, 2)
bob.give_raise(10000)
print(bob.salary)
class Gradyear:
    """Wraps a single value, the year."""

    def __init__(self, year):
        """Store year exactly as passed."""
        self.year = year
# The class defines no __str__, so print falls back to Python's default
# text for objects rather than showing the year.
year = Gradyear(2024)
print(year)
class Gradyear:
    """Wraps a single value, the year, and prints as that year."""

    def __init__(self, year):
        """Store year exactly as passed."""
        self.year = year

    def __str__(self):    # Our own implementation
        """Return the year converted to a string."""
        year_text = str(self.year)
        return year_text
# With __str__ defined, print shows the year.
gradyear = Gradyear(2024)
print(gradyear)
# The class defines no __eq__ or __hash__, so two separately created
# objects are compared by identity: == is False, and a set keeps both.
gy1 = Gradyear(2024)
gy2 = Gradyear(2024)
print(gy1 == gy2)
myset = set()
myset.add(gy1)
myset.add(gy2)
len(myset)
class Gradyear:
    """
    Wraps a single value, the year.

    Two Gradyear objects are equal when their years are equal, and equal
    objects hash alike, so sets and dicts treat them as the same key.
    """

    def __init__(self, year):
        """Store year exactly as passed."""
        self.year = year

    def __str__(self):    # Our own implementation
        """Return the year converted to a string."""
        return str(self.year)

    def __eq__(self, other):
        """Compare by year; only other Gradyear objects can be equal."""
        # NotImplemented lets Python fall back to its default handling, so
        # comparing with an unrelated object gives False instead of raising
        # AttributeError on other.year.
        if not isinstance(other, Gradyear):
            return NotImplemented
        return self.year == other.year

    def __hash__(self):
        """Hash by year, so that equal objects have equal hashes."""
        # hash() always yields an int, which __hash__ is required to return;
        # for an int year this is the number itself.
        return hash(self.year)
# With __eq__ and __hash__ defined in terms of the year, two objects
# holding the same year compare equal and occupy one slot in a set.
gy1 = Gradyear(2024)
gy2 = Gradyear(2024)
print(gy1 == gy2)
myset = set()
myset.add(gy1)
myset.add(gy2)
len(myset)
# Lecture20Recursion.py
def bad_recursion():
  """
  Counter-example: a recursive function with no base case.

  Every call prints and then calls itself again, so nothing inside the
  function ever stops the chain of calls.
  """
  print("Bad!")
  bad_recursion()  # unconditional self-call: there is no stopping condition
# Running this never finishes normally: the function keeps calling itself.
bad_recursion()
def factorial(n):
    """
    Compute n! recursively.

    Argument: n (int):  a non-negative integer
    Returns:  n * (n-1) * ... * 1; both 0! and 1! are 1.
    Raises:   ValueError if n is negative.
    """
    if n < 0:
        raise ValueError("factorial is not defined for negative numbers")
    # Base case covers 0 as well as 1; testing only n == 1 would let
    # factorial(0) recurse forever.
    if n <= 1:
        return 1
    return n * factorial(n-1)
# 4! = 4 * 3 * 2 * 1
print (factorial(4))
def factorial(n):
    """
    Compute n! recursively, printing a trace of every call and return.

    Argument: n (int):  a non-negative integer
    Returns:  n * (n-1) * ... * 1; both 0! and 1! are 1.
    Raises:   ValueError if n is negative.
    """
    if n < 0:
        raise ValueError("factorial is not defined for negative numbers")
    print(f'Evaluating {n}!')
    # Base case covers 0 as well as 1; testing only n == 1 would let
    # factorial(0) recurse forever.
    if n <= 1:
        print('Returning 1')
        return 1
    result = n * factorial(n-1)
    print(f'Returning {result}')
    return result
# The trace lines appear first, then the final result from this print.
print (factorial(4))
def sum_m_to_n(m, n):
    """
    Add up the integers from m to n (both included), recursively.

    Arguments:  m (int):  the first number of the range
                n (int):  the last number of the range
    Returns:  m + (m+1) + ... + n; 0 when n < m (nothing to add).
    """
    # Without this guard, n < m would step away from m forever.
    if n < m:
        return 0
    if n == m:
        return m
    result = n + sum_m_to_n(m, n-1)
    return result
# Example call; the expected total is worked out in the comment.
sum_m_to_n(3, 7) # 3 + 4 + 5 + 6 + 7 = 25
def sum_m_to_n(m, n):
    """
    Add up the integers from m to n (both included), recursively,
    printing a trace of every call and return.

    Arguments:  m (int):  the first number of the range
                n (int):  the last number of the range
    Returns:  m + (m+1) + ... + n; 0 when n < m (nothing to add).
    """
    print(f'Evaluating sum from {m} to {n}')
    # Without this guard, n < m would step away from m forever.
    if n < m:
        print('Returning 0')
        return 0
    if n == m:
        print(f'Returning {m}')
        return m
    result = n + sum_m_to_n(m, n-1)
    print(f'Returning {result}')
    return result
# Same example, now with the trace of calls and returns printed.
sum_m_to_n(3, 7) # 3 + 4 + 5 + 6 + 7 = 25
def mypow(a, p):
    """
    Raise a to the integer power p, recursively.

    Arguments:  a:  the base
                p (int):  the exponent; may be negative
    Returns:  a multiplied by itself p times (1 when p is 0); for a
              negative p, 1 divided by a to the power -p.
    """
    # A negative exponent would otherwise count down forever, away from 0.
    if p < 0:
        return 1 / mypow(a, -p)
    if p == 0:
        return 1
    result = a * mypow(a, p-1)
    return result
# 2 to the 8th power
mypow(2,8)
def mypow(a, p):
    """
    Raise a to the integer power p, recursively, printing a trace of
    every call and return.

    Arguments:  a:  the base
                p (int):  the exponent; may be negative
    Returns:  a multiplied by itself p times (1 when p is 0); for a
              negative p, 1 divided by a to the power -p.
    """
    print(f'Evaluating {a}^{p}')
    if p == 0:
        print('Returning 1')
        return 1
    if p < 0:
        # A negative exponent would otherwise count down forever, away from 0.
        result = 1 / mypow(a, -p)
    else:
        result = a * mypow(a, p-1)
    print(f'Returning {result}')
    return result
# Same example, now with the trace of calls and returns printed.
mypow(2,8)
def fib(n):
    """
    Compute the n-th Fibonacci number recursively.

    Argument: n (int):  position in the sequence, counting from 0
    Returns:  0 for n == 0, 1 for n == 1, otherwise fib(n-1) + fib(n-2).
    """
    if n == 0:
        return 0
    return 1 if n == 1 else fib(n-1) + fib(n-2)
# Print fib(0) through fib(9), one per line.
for i in range(10):
    print(fib(i))
def r_perm(r, n):
    """
    Multiply together the integers r+1, r+2, ..., n, recursively.

    Arguments:  r (int):  one less than the smallest factor
                n (int):  the largest factor
    Returns:  n * (n-1) * ... * (r+1).  The recursion stops only when it
              reaches n == r+1.
    """
    if n != r+1:
        return n * r_perm(r,n-1)
    return n
# 7 * 6 = 42
r_perm(5,7)
def iter_factorial(n):
    """
    Compute n! with a loop instead of recursion.

    Argument: n (int):  the number whose factorial is wanted
    Returns:  the product of the integers 1..n; 1 when that range is empty.
    """
    product = 1
    # Multiply from n down to 1; the order does not change the product
    for factor in range(n, 0, -1):
        product = product * factor
    return product
  
# 4! computed with the loop version
print(iter_factorial(4))
import numpy as np
def iter_fib(n):
    """
    Compute the n-th Fibonacci number with a loop instead of recursion.

    Argument: n (int):  position in the sequence, counting from 0
    Returns:  the n-th Fibonacci number as an exact int.
    Raises:   ValueError if n is negative.
    """
    if n < 0:
        raise ValueError("n must not be negative")
    if n == 0 or n == 1:
        return n
    # Only the last two values are needed to produce the next one.  Plain
    # Python ints are used because they stay exact for any size, whereas
    # floating-point storage rounds large Fibonacci numbers.
    previous, current = 0, 1
    for _ in range(2, n+1):
        previous, current = current, previous + current
    return current
# Print iter_fib(0) through iter_fib(9), one per line.
for i in range(10):
    print(iter_fib(i))
def power_set(setstring):
    """
    List every subset of the characters of setstring, recursively.

    Argument: setstring (str):  the characters of the set
    Returns:  a list of strings, one per subset: first all subsets that
              contain the first character, then all that do not.
    """
    if len(setstring) == 0:
        return [""]
    first_char = setstring[0]
    # Recursive call gets all the subsets that don't involve the first character
    without_first = power_set(setstring[1:])
    # The first character is either in the subset, or not.
    with_first = [first_char + subset for subset in without_first]
    return with_first + list(without_first)
# All subsets of the four characters a, b, c, d
power_set("abcd")
def recursive_sum(lst):
    """
    Add up the items of a list recursively.

    Argument: lst (list):  the numbers to add
    Returns:  the first item plus the sum of the rest; 0 for an empty list.
    """
    if lst:
        return lst[0] + recursive_sum(lst[1:])
    return 0  # empty list
# 1 + 2 + 3
recursive_sum([1,2,3])
def recursive_filter(min_val, lst):
    """
    Keep only the items that are at least min_val, recursively.

    Arguments:  min_val:  the smallest value to keep
                lst (list):  the items to filter
    Returns:  a new list with the items >= min_val, in their original order.
    """
    if not lst:
        return []
    first, remaining = lst[0], lst[1:]
    kept = [first] if first >= min_val else []
    return kept + recursive_filter(min_val, remaining)
# Keep the items that are at least 3
recursive_filter(3, [1, 2, 3, 4, 5])
def recursive_index(item, lst, index):  # index tracks where we are in the list
    """
    Find the position of the first occurrence of item, recursively.

    Arguments:  item:  the value to look for
                lst (list):  the part of the list still to be searched
                index (int):  the position of lst[0] in the full list
    Returns:  index plus the offset of the first match within lst, or
              None when item is not found.
    """
    if not lst:
        return None   # not found
    if lst[0] != item:
        return recursive_index(item,lst[1:],index+1)
    return index
# Search from the start of the list, so the starting index is 0
recursive_index(5, [0, 1, 2, 5], 0)
def recursive_skiplist(lst):
    """
    Take every other item of a list, starting with the first, recursively.

    Argument: lst (list):  the items to pick from
    Returns:  a list with the items at positions 0, 2, 4, ...; a list of
              length 1 is returned as is.
    """
    length = len(lst)
    if length == 0:
        return []
    if length == 1:
        return lst
    # Keep the first item, drop the second, and repeat on what follows
    rest = recursive_skiplist(lst[2:])
    return [lst[0]] + rest
# Keeps the items at positions 0, 2 and 4
recursive_skiplist([5,3,7,2,9])
# Lecture21DataStructures.py
class ll_node:
    """One node of a singly linked list; the first node stands for the whole list."""

    def __init__(self, num):
        self.number = num
        self.next = None

    def append(self, num):
        """Add num at the end of the list that starts at this node."""
        if self.next is not None:
            # Not the last node yet: recursively append to rest of list
            self.next.append(num)
            return
        # End of the list - add the node
        self.next = ll_node(num)

    def contains(self, othernum):
        """Return True if othernum is stored in this node or any later one."""
        if self.number == othernum:  # We found it
            return True
        if self.next is None:  # We reached the end, didn't find it
            return False
        # Not here, there's more list - so, keep looking (recursively)
        return self.next.contains(othernum)

    def __str__(self):
        """Return the numbers from this node to the end, separated by spaces."""
        own_text = str(self.number)
        if self.next is None:  # Last number
            return own_text
        # This number, then the rest of the list (more recursion)
        return own_text + ' ' + str(self.next)
# Build the list 6 -> 1 -> 7, print it, and test membership.
mylist = ll_node(6)
mylist.append(1)
mylist.append(7)
print(mylist)
print('Contains 7: ' + str(mylist.contains(7)))
print('Contains 5: ' + str(mylist.contains(5)))
import numpy as np
class dynamic_array:  # Showing how Python lists work
    """
    A growable array kept in a fixed-size numpy buffer that is replaced
    by a larger one whenever it fills up.

    Attributes:
      memory:    the numpy buffer (created with np.zeros)
      occupied:  how many slots of the buffer hold appended values
      size:      the buffer's current capacity
    """

    def __init__(self, initial_size):
        self.memory = np.zeros(initial_size)
        self.occupied = 0
        self.size = initial_size

    def __str__(self):
        """Return the text of the whole buffer, including unused slots."""
        return str(self.memory)

    def append(self, val):
        """Store val in the next free slot, growing the buffer first if it is full."""
        if self.occupied == self.size:
            print('Resizing...')
            # Double the capacity, but never below 1: doubling a capacity
            # of 0 stays 0, and the write further down would then fail.
            new_size = max(1, self.size*2)
            new_memory = np.zeros(new_size)
            # A "hiccup" in running time as everything's copied
            new_memory[:self.occupied] = self.memory[:self.occupied]
            self.memory = new_memory
            self.size = new_size
        print('Adding ' + str(val))
        self.memory[self.occupied] = val
        self.occupied += 1
# Start with room for 2 values and append 4; the third append finds the
# buffer full and triggers a resize.
my_array = dynamic_array(2)
print(my_array)
my_array.append(1)
my_array.append(1)
print(my_array)
my_array.append(1)
print(my_array)
my_array.append(1)
print(my_array)
class FolderTree:
    """A binary tree node: a value plus optional left and right children."""

    def __init__(self, val):
        self.val = val
        self.left = None
        self.right = None

    def addLeft(self, node):
        """Make node the left child."""
        self.left = node

    def addRight(self, node):
        """Make node the right child."""
        self.right = node

    def find(self, v):
        """Return True if v is the value of this node or of any node below it."""
        if self.val == v:
            return True
        # Left subtree first, then right.  A missing child is skipped so
        # that find() is never called on None.
        for child in (self.left, self.right):
            if child and child.find(v):
                return True
        return False
# Create the four leaves and the two middle nodes ...
leftleftchild = FolderTree("wow.exe")
leftrightchild = FolderTree("xls.exe")
rightleftchild = FolderTree("lec12.pdf")
rightrightchild = FolderTree("lec14.pdf")
leftparent = FolderTree("apps")
rightparent = FolderTree("lecs")
# ... attach the leaves to the middle nodes ...
leftparent.addLeft(leftleftchild)
leftparent.addRight(leftrightchild)
rightparent.addLeft(rightleftchild)
rightparent.addRight(rightrightchild)
# ... and the middle nodes to the root.
root = FolderTree("root")
root.addLeft(leftparent)
root.addRight(rightparent)
# One value that is in the tree and one that is not.
print(root.find("wow.exe"))
print(root.find("lec13.exe"))
def count_nodes(tree):
    """
    Count the nodes of a binary tree recursively.

    Argument: tree:  a node with left and right attributes, or None
    Returns:  0 for None, otherwise 1 plus the counts of both subtrees.
    """
    if tree is None:
        return 0
    left_count = count_nodes(tree.left)
    right_count = count_nodes(tree.right)
    return 1 + left_count + right_count
# Count every node reachable from root
count_nodes(root)
def calc_depth(tree):
    """
    Compute the depth of a binary tree recursively.

    Argument: tree:  a node with left and right attributes, or None
    Returns:  0 for None and for a node without children, otherwise 1 plus
              the larger of the two subtree depths.
    """
    if tree is None:
        return 0
    is_leaf = tree.left is None and tree.right is None
    if is_leaf:
        return 0  # Leaf has depth 0 in its subtree
    deepest_child = max(calc_depth(tree.left), calc_depth(tree.right))
    return 1 + deepest_child
# Depth of the tree that starts at root
calc_depth(root)
class BinarySearchTree:
    """A binary tree node whose find() looks in only one subtree per step."""

    def __init__(self, val):
        self.val = val
        self.left = None
        self.right = None

    def addLeft(self, node):
        """Make node the left child."""
        self.left = node

    def addRight(self, node):
        """Make node the right child."""
        self.right = node

    def find(self, v):
        """
        Return True if v is found, False otherwise.

        Values smaller than this node's value are looked for in the left
        subtree only, all other values in the right subtree only.  The
        direction taken is printed at every step.
        """
        if self.val == v:
            return True
        if v < self.val:
            child, message = self.left, "Going Left"
        else:
            child, message = self.right, "Going Right"
        if not child:
            # Nothing on the side where v would have to be
            return False
        print(message)
        return child.find(v)
# Build a three-level tree by hand.  The nodes are placed so that each
# left child holds a smaller letter than its parent and each right child
# a larger one, which is what find() relies on.
root = BinarySearchTree("m")
leftparent = BinarySearchTree("f")
rightparent = BinarySearchTree("q")
leftleftchild = BinarySearchTree("a")
leftrightchild = BinarySearchTree("h")
rightleftchild = BinarySearchTree("o")
rightrightchild = BinarySearchTree("u")
leftparent.addLeft(leftleftchild)
leftparent.addRight(leftrightchild)
rightparent.addLeft(rightleftchild)
rightparent.addRight(rightrightchild)
root.addLeft(leftparent)
root.addRight(rightparent)
# One letter that is in the tree and one that is not.
print(root.find("h"))
print(root.find("d"))
class infect_tree:
    """A tree node with any number of children: a person and the people they infected."""

    # name is a string, infects is a list of infect_tree's infected
    def __init__(self, name, infects):
        self.name, self.infects = name, infects
# Build the tree bottom-up: a node can only list children that already
# exist, so people who infected nobody are created first.
jake = infect_tree('jake', [])
eric = infect_tree('eric', [])
fifi = infect_tree('fifi', [])
ged = infect_tree('ged', [])
hao = infect_tree('hao', [])
idris = infect_tree('idris', [jake])
bob = infect_tree('bob', [eric])
che = infect_tree('che', [])
daphne = infect_tree('daphne', [fifi, ged, hao, idris])
# alice is the root of the tree.
alice = infect_tree('alice', [bob, che, daphne])
def find_most_infections(my_tree):
    """
    Find the node with the most direct children anywhere in a tree.

    Argument: my_tree:  a node with name and infects attributes
    Returns:  a tuple (name, count) for the node with the longest infects
              list; on a tie the node met first is kept.
    """
    champion = (my_tree.name, len(my_tree.infects))
    for child in my_tree.infects:
        candidate = find_most_infections(child) # Recursion...
        # Strictly greater, so an equal count does not replace the champion
        if candidate[1] > champion[1]:
            champion = candidate
    return champion
# Search the whole tree, starting from its root
find_most_infections(alice)
def find_all_descendants(my_tree):
    """
    List the names in a tree, recursively.

    Argument: my_tree:  a node with name and infects attributes
    Returns:  a list that starts with my_tree's own name, followed by the
              names from each child's subtree in order.
    """
    names = [my_tree.name]
    for child in my_tree.infects:
        names.extend(find_all_descendants(child))  # More recursion
    return names
# The result starts with 'daphne' itself, followed by everyone below her
find_all_descendants(daphne)
# Lecture22ScikitLearn.py
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
digits = load_digits()
print(digits.data.shape) # Examples x 64 pixels
import matplotlib.pyplot as plt 
plt.gray() 
plt.matshow(digits.images[0]) # Notice images[0] is 2D
# Suppress FutureWarning messages for the rest of the session.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.neighbors import KNeighborsClassifier
# Fit a 3-nearest-neighbors classifier on ALL the data and score it on
# that same data.
nbrs = KNeighborsClassifier(n_neighbors=3).fit(digits.data, digits.target)
nbrs.score(digits.data, digits.target) # Find accuracy on the training dataset
from sklearn.model_selection import train_test_split
# Hold out 20% of the examples, fit on the rest, and score on the
# held-out part only.
data_train, data_test, label_train, label_test = train_test_split(digits.data, digits.target, test_size=0.2)
nbrs = KNeighborsClassifier(n_neighbors=3).fit(data_train, label_train)
nbrs.score(data_test,label_test)
# Predicted labels for the first three held-out examples.
print(nbrs.predict(data_test[0:3]))
def reshape_and_show(num, data_test):
    """
    Display one row of data_test as an 8x8 image.

    Arguments:  num (int):  which row of data_test to show
                data_test:  an array whose rows can be reshaped to 8x8
    """
    plt.matshow(data_test[num].reshape(8,8))
# Show the three examples whose predictions were printed.
reshape_and_show(0,data_test)
reshape_and_show(1,data_test)
reshape_and_show(2,data_test)
# Same procedure on a face dataset.
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person = 100)
plt.imshow(faces.images[5], cmap="gray")
data_train, data_test, label_train, label_test = train_test_split(faces.data, faces.target, test_size=0.2)
nbrs = KNeighborsClassifier(n_neighbors=3).fit(data_train, label_train)
 
nbrs.score(data_test,label_test)
# Seeding a random number generator makes its output repeatable.
import random
random.seed(110)  # Set seed - comment this out to get different rolls
print(random.randint(1,8))  # Normally produces random integer 1-8
print(random.randint(1,8))
# random_state plays the same role for the train/test split.
data_train, data_test, label_train, label_test = train_test_split(faces.data,
                                                                  faces.target, test_size=0.2,
                                                                  random_state=110) # Set the seed
nbrs = KNeighborsClassifier(n_neighbors=3).fit(data_train, label_train)
 
nbrs.score(data_test,label_test)
# Cross-validation scores computed from the training part only.
from sklearn.model_selection import cross_val_score
cross_val_score(nbrs, data_train, label_train)
import numpy as np
# Try n_neighbors = 1..9 and print the mean cross-validation score of each.
for i in range(1,10):
  nbrs = KNeighborsClassifier(n_neighbors=i)
  print(np.mean(cross_val_score(nbrs, data_train, label_train)))
# Lecture23DecisionTrees.py
import math
# Worked example: weighted average entropy of a two-way split.
# The "yes" branch is given entropy 0; the "no" branch is a 20%/80% mix,
# with entropy measured in bits (log base 2).
yes_branch_entropy = 0
no_branch_entropy = -0.2 * math.log(0.2,2) - 0.8 * math.log(0.8, 2)
# Branch weights: 5 of 2005 examples go to "yes", 2000 of 2005 go to "no"
pr_yes = 5/2005
pr_no = 2000/2005
print(pr_yes * yes_branch_entropy + pr_no * no_branch_entropy)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
# Load the iris dataset and look at its feature names, class names and first row
iris = load_iris()
iris.feature_names
iris.target_names
iris.data[0]
# Hold out 10% for testing; random_state pins the split to a seed
features_train, features_test, labels_train, labels_test = \
train_test_split(iris.data, iris.target, test_size=0.1, random_state=110)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
# Decision tree that chooses splits by entropy, with a fixed seed
dtree = DecisionTreeClassifier(criterion="entropy", random_state=110)
dtree.fit(features_train, labels_train)
dtree.score(features_test, labels_test) # Gives accuracy
import matplotlib.pyplot as plt
from sklearn import tree
# Draw the fitted tree, labelled with the dataset's feature and class names
plt.figure(figsize=(14,10))
tree.plot_tree(dtree, feature_names = iris.feature_names, class_names = iris.target_names)
# Lecture24RandomForestsOnly.py
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
iris = load_iris()
# The dataset's fields are read with dictionary-style keys in this section
iris["feature_names"]
# Hold out 10% for testing; random_state pins the split to a seed
features_train, features_test, labels_train, labels_test = \
  train_test_split(iris['data'], iris['target'],
                   test_size=0.1,random_state=110)
# Forest of 200 trees that choose splits by entropy, with a fixed seed
irisforest = RandomForestClassifier(n_estimators=200,criterion="entropy",random_state=110)
irisforest.fit(features_train, labels_train)
irisforest.score(features_test, labels_test)
# One importance value per feature, in the same order as iris["feature_names"]
irisforest.feature_importances_
# Lecture25Regression.py
import numpy as np
# 33 evenly spaced values from 1984 to 2016, i.e. one per year
x = np.linspace(1984, 2016, 33)
# One value per year in x (33 entries); the message printed after fitting
# treats these as temperatures in degrees F
y = [48.0, 47.3, 47.2, 47.4, 47.2, 46.7,
     49.7, 49.6, 46.4, 47.3, 47.7, 47.8, 47.3, 47.4, 50.4, 49.8, 
     47.5, 49.1, 49.4, 47.1, 47.6, 48.4, 50.1, 48.3, 48.6, 47.8,
     50.4, 49.7, 51.4, 48.8, 47.7, 48.5, 50.3]
import matplotlib.pyplot as plt
plt.plot(x,y,'o')
import sklearn.linear_model as lm
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
# Turn x into a single-column 2D array (one row per year) before fitting
x = x.reshape(-1,1)
linear_model.fit(x,y)
# Fitted values for the same years, drawn as a red line over the data points
y_hat = linear_model.predict(x)
plt.plot(x,y,'o')
plt.plot(x,y_hat,'r')
print(f'The temperature is rising {linear_model.coef_[0]:.4f} degrees F per year')
print(f'{linear_model.intercept_:.2f}')
# Score the fit on the same data it was trained on
linear_model.score(x,y)
# Second example: regression on two features.
# methane has 33 entries (one per year from 1984); mass_co has 27 entries.
# NOTE(review): units and data source are not stated here -- confirm.
methane = np.array([12.81, 25.15, 38.06, 49.47, 60.24, 71.32,
     80.08, 94.14, 96.49, 100.32, 107.54, 111.50, 113.97, 120.26, 132.39, 134.82,
     133.30, 132.60, 135.91, 140.65, 135.76, 136.14, 138.11, 145.90, 152.41, 157.13,
     162.33, 167.15, 172.17, 177.86, 190.62, 200.65, 207.73])
mass_co = [84, 82.7, 84.9, 81.7, 81.9, 79.2, 79.9, 85.9, 84.3, 81.9,
           82.9, 82.8,83.7, 85, 83.6, 85, 77.1, 80.4, 77.2, 70.6,
           72.0, 68.1, 61.9, 65.7, 63.8, 65.6, 63.9]
# Drop the first six years (1984-1989) so all three series have 27 entries
y_from_90 = y[6:]  # From the last example, these are the temperatures
methane_from_90 = methane[6:]
# Stack the two features as columns: one row per year, columns (mass_co, methane).
# This rebinds x, replacing the year column used above.
x = np.transpose(np.array([mass_co, methane_from_90]))
x
temp_model = LinearRegression()
temp_model.fit(x,y_from_90)
# One coefficient per feature column, then the intercept
print(temp_model.coef_)
print(temp_model.intercept_)
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib.pyplot as plt
model = DecisionTreeRegressor() # no pruning of any kind, so expect overfitting
# Rebuild the year column and the 33 yearly values from the first example
x = np.linspace(1984, 2016, 33)
x = x.reshape(-1,1)
y = [48.0, 47.3, 47.2, 47.4, 47.2, 46.7,
     49.7, 49.6, 46.4, 47.3, 47.7, 47.8, 47.3, 47.4, 50.4, 49.8, 
     47.5, 49.1, 49.4, 47.1, 47.6, 48.4, 50.1, 48.3, 48.6, 47.8,
     50.4, 49.7, 51.4, 48.8, 47.7, 48.5, 50.3]
# Train on the first 30 years; the last 3 are left out of training
xtrain = x[:30]
ytrain = y[:30]
model.fit(xtrain,ytrain)
# Predict for every year, training years included
yhat = model.predict(x)
plt.plot(x,y,'o')
plt.plot(x[:30],yhat[:30])
# Starts at index 29 so the red segment connects to the last training year
plt.plot(x[29:],yhat[29:],'r') # Plot line to test predictions in red
model = DecisionTreeRegressor(max_depth = 3) # maybe overdoing it on the pruning
# x is 1D again here so it can be stacked next to the previous-value feature
x = np.linspace(1984, 2016, 33)
prev_value_features = [0] + y.copy()[:-1] # shift y values so we see the previous one; discard last
# Two columns per row: (year, previous year's value); the first row's previous value is 0
combined_features = np.array([x, prev_value_features]).transpose()
print(combined_features)
# First 30 rows for training; ytrain is still y[:30] from above
xtrain = combined_features[:30,:]
model.fit(xtrain,ytrain)
yhat = model.predict(combined_features)
plt.plot(x,y,'o')
plt.plot(x[:30],yhat[:30])
plt.plot(x[29:],yhat[29:],'r')
# Same features and plots, now with a random forest (default settings, no seed)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(xtrain,ytrain) # xtrain has the matrix we made in the previous code box
yhat = model.predict(combined_features)
plt.plot(x,y,'o')
plt.plot(x[:30],yhat[:30])
plt.plot(x[29:],yhat[29:],'r')
# Same features and plots, now with 3-nearest-neighbors regression
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=3)
model.fit(xtrain,ytrain) # xtrain has the matrix we made in the previous code box
yhat = model.predict(combined_features)
plt.plot(x,y,'o')
plt.plot(x[:30],yhat[:30])
plt.plot(x[29:],yhat[29:],'r')
# Lecture26ModernNLPandML.py
import pandas as pd
# Tab-separated training file, fetched over the network by read_csv
SST2_LOC = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
# header=None: the file has no header row, so the columns are numbered 0, 1, ...
# (later code uses column 0 as the text and column 1 as the label)
df = pd.read_csv(SST2_LOC, delimiter='\t', header=None)
df
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt') # Name means 'period' in German; from Kiss and Strunk 2006
# Example: see how the tokenizer handles a contraction, a currency amount and punctuation
word_tokenize("I won't sell my cat for even $1,000,000,000.") 
def wordset(raw_text):
  """Return the set of distinct tokens in `raw_text`.

  The text is lowercased before being split with `word_tokenize`, so tokens
  that differ only in case collapse into one entry.
  """
  lowered = raw_text.lower()
  return set(word_tokenize(lowered))
def all_words_set(df_column):
  """Tokenize every text in `df_column` and collect the overall vocabulary.

  Args:
    df_column: An iterable of strings (for example a DataFrame column).

  Returns:
    A `(set_of_all, dict_of_all)` tuple.  `set_of_all` holds every token seen
    in any text.  `dict_of_all` maps each text to its own token set; texts that
    appear more than once share a single entry.
  """
  set_of_all = set()
  dict_of_all = {}
  for row in df_column:
    textset = wordset(row)
    # Grow the vocabulary in place.  Rebinding it to set.union() copied the
    # whole accumulated set on every row, which gets slow on large columns.
    set_of_all.update(textset)
    dict_of_all[row] = textset
  return set_of_all, dict_of_all
def one_hot_columns(df_column):
  """Build a one-hot (word presence) table for a column of texts.

  Args:
    df_column: A sequence of strings (for example a DataFrame column).

  Returns:
    A DataFrame with one row per text, in input order, and one column per
    distinct token.  A cell is 1 when the token occurs in that text and 0
    otherwise.  Columns are in sorted token order.
  """
  all_words, all_tokenizations = all_words_set(df_column)
  # Fetch each row's token set once, iterating over the values themselves.
  # Indexing df_column by 0..n-1 is a label lookup on a pandas Series, so it
  # only works when the index happens to be the default range.
  row_tokens = [all_tokenizations[text] for text in df_column]
  word_dict = {}
  # Sorting gives a stable column order.  Iterating the set directly can order
  # the columns differently from one interpreter run to the next.
  for word in sorted(all_words):
    word_dict[word] = [1 if word in tokens else 0 for tokens in row_tokens]
  # We can create a dataframe from a dictionary of column header
  # to list of column values
  return pd.DataFrame.from_dict(word_dict)
# One-hot encode the text column (first column of df)
one_hot_cols = one_hot_columns(df.iloc[:,0])
one_hot_cols
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Column 1 of df is used as the label; the one-hot table supplies the features
labels = df[1]
features = one_hot_cols
# test_size is not given, so train_test_split uses its default proportion;
# random_state pins the split to a seed
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42)
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
# Column sums: for each word, the number of texts that contain it
one_hot_cols.sum()
import gensim.downloader as api
# Load pretrained word vectors by name through gensim's downloader
wv = api.load('word2vec-google-news-300')
# Indexing by a word returns that word's vector
wv['king']
print(wv.most_similar('king')) # Prints words and cosines of angles with 'king'
import numpy as np
def find_cosine(vec1, vec2):
  """Return the cosine of the angle between `vec1` and `vec2`.

  Each vector is divided by its own norm, and the dot product of the two
  resulting unit vectors is returned.
  """
  # Rescale both inputs to unit length
  first_unit, second_unit = (v / np.linalg.norm(v) for v in (vec1, vec2))
  # For unit vectors the dot product equals the cosine of their angle
  return np.dot(first_unit, second_unit)
# Compute the cosine by hand for two words, then call the embedding's own
# similarity method on the same pair for comparison
print(find_cosine(wv['king'], wv['faucet']))
wv.similarity('king', 'faucet')
def find_avg_vector(txt, embedding):
  """Average the embedding vectors of the words in `txt`.

  Args:
    txt: Text to tokenize with `word_tokenize`.
    embedding: A word-to-vector lookup that supports `word in embedding` and
      `embedding[word]`.

  Returns:
    A pandas Series holding the mean of the vectors of all tokens found in
    `embedding`.  A token that occurs several times is counted each time.
    When no token is found, a zero vector is returned instead.
  """
  words = word_tokenize(txt)
  vec_sum = None
  count = 0
  for word in words:
    if word in embedding:
      count += 1
      if vec_sum is not None:
        vec_sum += embedding[word]
      else:
        # The embeddings are read-only unless you copy them
        vec_sum = embedding[word].copy()
  if vec_sum is None:
    # Treat no word found in embedding as zero vector.  Size it from the
    # embedding when it reports a vector_size, so embeddings that are not
    # 300-dimensional work too; otherwise fall back to the original 300.
    dimensions = getattr(embedding, 'vector_size', 300)
    return pd.Series(np.zeros((dimensions,)))
  return pd.Series(vec_sum/count)
# Try the averaging on a single sentence
find_avg_vector('Long live the king and queen!', wv)
# Average vector for every text in column 0. Each call returns a Series, so
# apply() produces a DataFrame with one column per vector component.
df_embeddings = df[0].apply(lambda txt: find_avg_vector(txt, wv))
# Rename the numeric column labels to 'feature0', 'feature1', ...
df_embeddings.rename(columns=lambda x: 'feature'+str(x), inplace=True)
# Put the original columns (text, label) side by side with the new feature columns
df_augmented = pd.concat([df, df_embeddings], axis=1)
df_augmented
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
labels = df_augmented[1]
# Skip the first two columns (text and label); what remains are the embedding features
features = df_augmented.iloc[:,2:]
# Same split seed and classifier settings as the one-hot experiment
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42)
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)