from src.simulation import *
from src.tests import *

print(f"Number of memes in the dataset: {memes.shape[0]}")
memes.head()

Number of memes in the dataset: 6993

# this cell may take some time to run the first time
show_meme(memes.iloc[5000].name)

nb_users = 5
# Generate users
init_users = generate_users(nb_users)
# Display 5 lines
init_users.head()

# select a random item in the memes dataframe
meme = memes.sample(1) # returns a dataframe even if there is only 1 row in it!

# display the meme
show_meme(meme.iloc[0].name)

def selector_random(items, user, k):
    """
    Pick a slate at random, without looking at the user at all.

    :param items: dataframe with all items from which to select
    :param user: user for whom to select the items (not used by this selector)
    :param k: number of items to select
    return: (DataFrame) k rows of `items`, sampled without replacement
    """
    ### YOUR CODE HERE
    slate = items.sample(n=k) # SOLUTION
    ###

    return slate

# tests to check if your function is correct
test(selector_random)

🆗 Tests passed ! =)

selector_random(memes, init_users.iloc[0], 3)

init_users = generate_users(30)
cols = ["humour", "sarcasm"]  # you can modify the columns to plot in other axis (2 columns necessary)
simulate_and_render(init_users, memes, selector_random, col=cols, nb_steps=30, custom=custom_items_plotting(memes, cols, nb_items = 100))

# Let's generate some fake dataframes
first_df = pd.DataFrame([['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']], index=[0, 1, 2])
second_df = pd.DataFrame([['x', 'y', 'z']], index=[3])
third_df = pd.DataFrame([[10, 10, 10]], index=[4])

# And look at the result of the concatenation
concatenation = pd.concat([first_df, second_df, third_df]) # WATCH OUT, concat takes a **list** of dataframes i.e. the [ and ] are needed
display(concatenation)

print("first row:")
print(concatenation.iloc[0])

print("\nfirst two rows:")
print(concatenation.iloc[:2])

print("\nrows with positions [1;3[:")
print(concatenation.iloc[1:3])

first row:
0    a
1    b
2    c
Name: 0, dtype: object

first two rows:
   0  1  2
0  a  b  c
1  d  e  f

rows with positions [1;3[:
   0  1  2
1  d  e  f
2  g  h  i

# select one row, we get a Series
first_row = concatenation.iloc[0]
print(type(first_row))
print(first_row, "\n")

# create a Dataframe with the Series
# /!\ If we don't pass the Series in a list (i.e. between []), it will be a column instead of a row
df_first_row = pd.DataFrame([first_row])
print(type(df_first_row))
print(df_first_row)

<class 'pandas.core.series.Series'>
0    a
1    b
2    c
Name: 0, dtype: object 

<class 'pandas.core.frame.DataFrame'>
   0  1  2
0  a  b  c

item_to_promote = memes.iloc[0]
print(type(item_to_promote))
print(item_to_promote)

<class 'pandas.core.series.Series'>
humour                        0.0
sarcasm                      -1.0
offensive                    -1.0
motivational                    1
overall_sentiment             1.0
author               Celebrations
Name: image_0.png, dtype: object

def selector_advertisement(items, user, k):
    """
    Build a slate of random items in which the promoted item is shown in second position.

    The promoted item is read from the notebook-level variable `item_to_promote` (a Series).
    The random items are drawn from all of `items`, so the promoted item may also be part of them.

    :param items: dataframe with all items from which to select
    :param user: user for whom to select the items (not used by this selector)
    :param k: number of items to select
    return: a slate consisting of the selected item together with k-1 random items. Order matters!
        When k is 1, the slate only contains the promoted item.
    """
    # a slate of a single item has no room for a random item in front of the ad:
    # return the promoted item alone instead of failing on an empty random draw
    if k == 1:
        return pd.DataFrame([item_to_promote])

    ### YOUR CODE HERE
    # first let's draw k-1 random items
    random_items = items.sample(k-1) #SOLUTION
    
    # first dataframe: has to be one of the random items, will be the first row in the result (has to be in a dataframe!)
    first_df = pd.DataFrame([random_items.iloc[0]]) # SOLUTION
    
    # second dataframe: include the item to promote (has to be in a dataframe!)
    second_df = pd.DataFrame([item_to_promote]) # SOLUTION
    
    # third dataframe: remaining random items
    third_df = random_items.iloc[1:] # SOLUTION

    # concatenate all dataframes
    slate = pd.concat([first_df, second_df, third_df]) # SOLUTION
    ###
    
    return slate

# tests to check if your function is correct
test(selector_advertisement)

🆗 Tests passed ! =)

selector_advertisement(memes, init_users.iloc[0], 3)

cols = ["offensive", "overall_sentiment"]
init_users = generate_users(30)
simulate_and_render(init_users, memes, selector=selector_advertisement, col=cols, nb_steps=30, custom=plot_item_to_promote(item_to_promote, cols))

# We provide this function that computes the euclidean distance between each row of a table (that can contain the items) and a vector (that can be a user).
def dist(table, vec):
    """
    Compute the euclidean distance between all rows of a DataFrame and a Series (or a vector).
    The provided DataFrame and Series must contain only numerical values! No string!

    Values are matched by position, not by label: the i-th value of `vec` is compared
    with the i-th column of `table`.

    :param table: DataFrame (or 2D array) with one item per row
    :param vec: Series, numpy array or list with one value per column of `table`
    return: (numpy array) the distance between `vec` and each row of `table`
    """
    # converting whatever is given to numpy in order to be able to apply linalg.norm
    rows_ = _as_float_array(table)
    vec_ = _as_float_array(vec)
    return np.linalg.norm(rows_ - vec_, axis=1)


def _as_float_array(values):
    """Convert a pandas object, a numpy array or a plain sequence to a float64 numpy array."""
    if hasattr(values, "to_numpy"):
        return values.to_numpy(dtype=np.float64)
    # plain vectors (numpy arrays, lists, tuples) have no to_numpy method
    return np.asarray(values, dtype=np.float64)

print(categories)
memes[categories].head()

Index(['humour', 'sarcasm', 'offensive', 'motivational', 'overall_sentiment'], dtype='object')

# sort the memes dataframe on the "humour" feature by decreasing value
sorted_by_humour = memes[categories].sort_values("humour", ascending = False) # WATCH OUT! sort_values does not modify the dataframe but returns a sorted version instead

# display the top lines of the result (the humour values should be 1.0 for all of the rows)
display(sorted_by_humour.head())

# display the first meme in the list, supposedly the funniest
print("One of the supposedly funniest meme in the list:")
show_meme(sorted_by_humour.iloc[0].name)

One of the supposedly funniest meme in the list:

def selector_preferences(items, user, k):
    """
    Select the k items that are the closest to the user's preferences.

    The distance between the user and every item is computed on the `categories` columns and
    is available in the "dist_with_user" column of the returned slate. The dataframe given
    as `items` is left untouched.

    :param items: dataframe with all items from which to select
    :param user: user for whom to select the items
    :param k: number of items to select
    return: a slate which takes the items that best suit the user's preferences
    """
    ### YOUR CODE HERE
    # put the distance between the user and the item in the "dist_with_user" column of the dataframe
    # (assign works on a copy, so the dataframe of the caller does not silently gain a column)
    items = items.assign(dist_with_user=dist(items[categories], user[categories])) # SOLUTION

    # sort the items by increasing distance
    items = items.sort_values("dist_with_user") # SOLUTION

    # return the k items closest to the user
    slate = items.head(k) # SOLUTION
    ###
    
    return slate

# tests to check if your function is correct
test(selector_preferences)

🆗 Tests passed ! =)

selector_preferences(memes, init_users.iloc[0], 3)

init_users = generate_users(30)
cols=["offensive", "motivational"]
simulate_and_render(init_users, memes, selector=selector_preferences, col=cols, custom=custom_items_plotting(memes, cols, nb_items = 100), nb_steps=30)

def selector_preferences_with_50p_random(items, user, k):
    """
    Select the items that best suit the user's preferences and, half of the time,
    replace the top of the slate with random items to add some variety.

    The dataframe given as `items` is left untouched; the returned slate carries the
    computed distances in its "dist_with_user" column.

    :param items: dataframe with all items from which to select
    :param user: user for whom to select the items
    :param k: number of items to select
    return: a slate that takes the k items that best suit the user's preferences
    and 50% of the time: two random items in first place instead of the two best ones
    (only one random item when k is 1)
    """
    ### YOUR CODE HERE
    # compute the distance between the user and each item
    # (assign works on a copy, so the dataframe of the caller is not modified)
    items = items.assign(dist_with_user=dist(items[categories], user[categories])) # SOLUTION
    
    # sort the items by distance
    items = items.sort_values("dist_with_user") # SOLUTION

    # BEGIN SOLUTION NO PROMPT
    # in 50% of the cases, replace the first two items at the top with a random selection
    if np.random.randint(0, 100) < 50: 
        # never insert more random items than the slate can hold (only matters when k < 2)
        nb_random = min(2, k)

        # keep the k first ordered items except the first two
        remaining_ordered_items = items.iloc[nb_random:k]

        # select two random items from the rest of the dataset (i.e. after the k first ones)
        random_items = items.iloc[k:].sample(nb_random)

        # build the slate
        slate = pd.concat([random_items, remaining_ordered_items]) 
    else:
        # select the first k items from the ordered list
        slate = items.iloc[:k]
    ###
    # END SOLUTION
    """ # BEGIN PROMPT
    # in 50% of the cases, replace the first two items at the top with a random selection
    if ...: 
        # keep the first k ordered items except the first two
        # select two random items from the rest of the dataset (i.e. after the k first ones)
        # build the slate
        slate = ...
    else:
        # select the first k items from the ordered list
        slate = ...
    ###
    """; # END PROMPT
    
    return slate

selector_preferences_with_50p_random(memes, init_users.iloc[0], 3)

init_users = generate_users(30)
cols=["offensive", "motivational"]
simulate_and_render(init_users, memes, selector=selector_preferences_with_50p_random, col=cols, custom=custom_items_plotting(memes, cols, nb_items = 100), nb_steps=30)

# Run this cell before you start the next exercise. It won't work otherwise
# obfuscation is presumably provided by the course helpers imported at the top of the notebook;
# it returns a replacement for both the memes and the users dataframes
memes, init_users = obfuscation(memes, init_users, lambda x: (x**3) * 27, chr, 115)
# the users returned above are immediately replaced by a fresh population of 20 users
init_users = generate_users(20)

simulate_and_render(init_users, memes, selector=selector_random, col=["humour", "offensive"], nb_steps=100)

# BEGIN SOLUTION NO PROMPT
# Let's display a sample of our list of memes and look at the authors
print(memes.sample(15)) # This gives a first hint that something is wrong: the same image shows up many times

# Let's count the number of memes posted by each author
print(memes["author"].value_counts()) # Here it is clear that Mallory doesn't play by the same rules as the others.

# Another case solved!
# END SOLUTION
""" # BEGIN PROMPT
# Do whatever is necessary here detective (⌐■_■)


"""; # END PROMPT

                humour  sarcasm  offensive  motivational  overall_sentiment           author  dist_with_user
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
image_6530.jpg     1.0      1.0       -0.5           1.0                1.0  An Unnamed cell        2.382942
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
"troll.jpeg"       0.0      0.0        0.0           0.0                0.0          Mallory             NaN
author
Mallory            27000
Yohan                276
Walter               271
Xx_D4rkL0rd_xX       269
Mike                 268
Olivia               264
Franck               261
Quentin              256
Ted                  255
Charlie              254
Niaj                 253
Ludovic              253
Bob                  250
Sybil                249
Leander              249
Zakarias             248
Ivan                 246
Chad                 245
Judy                 244
Vanna                243
Alice                242
Roman                242
Heidi                242
Eve                  239
David                239
Kurt                 237
GigaChad             236
Pat                  234
An Unnamed cell      227
Celebrations           1
Name: count, dtype: int64

answer = "Mallory" # SOLUTION
print("This is not the correct answer..." if not verification(answer) else "You got it!")

You got it!

# To revert the changes: remove every row labelled '"troll.jpeg"' from the memes dataframe
# (drop returns a new dataframe, hence the re-assignment)
memes = memes.drop('"troll.jpeg"')
# check the number of rows that are left
memes.shape

(6993, 7)

def selector(items, user, k):
    """
    Selector used by the final simulation: delegates to the recommendation algorithm of your choice.

    :param items: dataframe with all items from which to select
    :param user: user for whom to select the items
    :param k: number of items to select
    return: the slate built by the chosen selector
    """
    # Only one selector can be active at a time (nothing after a `return` is executed).
    # To try another one, replace the `return` at the bottom with one of these:
    #   return selector_preferences_with_50p_random(items, user, k)
    #   return selector_advertisement(items, user, k)
    #   return selector_preferences(items, user, k)
    #   return ... # your own
    return selector_random(items, user, k)

# attributes to visualize (x and y): "humour", "sarcasm", "offensive", "motivational", "overall_sentiment"
col = ["humour", "sarcasm"]
nb_steps = 100 # number of steps of the simulation

simulate_and_render(init_users, memes, selector=selector, col=col, nb_steps=nb_steps)

	name	humour	sarcasm	offensive	motivational	overall_sentiment
0	Quentin	0.273923	-0.460427	-0.918053	-0.966945	0.626540
1	Xx_D4rkL0rd_xX	0.213272	0.458993	0.087250	0.870145	0.631707
2	Charlie	-0.994523	0.714809	-0.932829	0.459311	-0.648689
3	Walter	0.082922	-0.400576	-0.154626	-0.943361	-0.751433
4	An Unnamed cell	0.341249	0.294379	0.230770	-0.232645	0.994420

Safety Week 2: Content recommendation¶

Introduction¶

Simulating how users react to content recommendation¶

Modelling choices¶

The memes dataset¶

The user dataset¶

1. A trivial recommendation algorithm: random pick¶

Random sampling in a dataframe¶

Recommending random items¶

2. Recommendation with advertisement¶

Concatenating dataframes¶

Selecting rows in dataframes¶

Including an ad in the recommended items¶

3. Preference-based recommendation¶

Distance to preferences¶

Sorting dataframes¶

Recommending items based on preferences¶

Increasing the variety of recommendations¶

4. Nowhere to be safe?¶

Back to random pick¶

Unexpected effects¶

Investigating the issue¶

[Optional] Implement your own recommendation algorithm¶

Synthesis¶

References¶

More explanations on the simulation¶

	humour	sarcasm	offensive	motivational	overall_sentiment	author
image_name
image_0.png	0.0	-1.0	-1.0	1	1.0	Celebrations
image_1.jpg	1.0	1.0	-1.0	-1	1.0	Bob
image_2.jpeg	-1.0	1.0	-1.0	1	1.0	GigaChad
image_3.JPG	0.5	-1.0	-1.0	-1	0.5	Walter
image_4.png	0.5	-0.5	0.5	1	0.5	Kurt

	humour	sarcasm	offensive	motivational	overall_sentiment	author
image_name
image_3962.jpg	0.5	-1.0	-1.0	1	0.5	Quentin
image_5798.jpg	-0.5	1.0	-1.0	-1	0.5	Judy
image_5947.png	-1.0	1.0	0.5	1	0.5	Ivan

	humour	sarcasm	offensive	motivational	overall_sentiment	author
image_4347.jpeg	0.5	-0.5	-1.0	-1	0.5	Charlie
image_0.png	0.0	-1.0	-1.0	1	1.0	Celebrations
image_4951.jpg	-0.5	1.0	0.5	-1	1.0	Ted

	humour	sarcasm	offensive	motivational	overall_sentiment	author	dist_with_user
image_name
image_1979.png	0.5	-0.5	-0.5	1	1.0	Charlie	0.92945
image_6629.jpg	0.5	-0.5	-0.5	1	1.0	Niaj	0.92945
image_3246.jpg	0.5	-0.5	-0.5	1	1.0	Sybil	0.92945

	humour	sarcasm	offensive	motivational	overall_sentiment	author	dist_with_user
image_name
image_6080.png	1.0	-0.5	1.0	-1	0.5	Yohan	0.913798
image_3823.jpg	-0.5	1.0	-1.0	-1	-0.5	Judy	2.536853
image_6585.jpg	1.0	0.5	0.5	-1	0.5	Ludovic	0.732143