Analysis of ratings on animated movies throughout the years#

Imports and useful functions#

[1]:

from matplotlib import ticker
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import os
import re
matplotlib.rc('font', size=13)

[2]:

def get_release_year(df):
    '''
    Extract the release year from a movie title.

    The first parenthesised four-digit number starting with ``19`` or
    ``20`` (for example ``"Toy Story (1995)"``) is taken as the year.

    Args:
        df: The title text to search. Despite its name this is a single
            value, not a DataFrame.

    Returns:
        int: The year found in the title, or 0 when the title holds no
            such year or is not a string (e.g. a missing value).
    '''
    if not isinstance(df, str):
        # A missing title (NaN/None) cannot hold a year; report it with the
        # same sentinel as a title without one instead of letting
        # re.search raise a TypeError.
        return 0
    match = re.search(r'\((?=19|20)(\d{4})\)', df)
    return int(match.group(1)) if match else 0

def get_percent_of_total(df):
    '''
    Express each entry of the 'count' column as a percentage of the
    column's sum.

    Args:
        df: Object whose 'count' item supports ``sum``, ``div`` and
            ``mul`` (e.g. a pandas DataFrame with a 'count' column).

    Returns:
        The 'count' values divided by their total and multiplied by 100.
    '''
    counts = df['count']
    return counts.div(counts.sum()).mul(100)

def align_y_axis(ax1, ax2):
    '''
    Line up the zeros of the y-axes of two plots.

    For each axis the share ``top / (top - bottom)`` of its current
    y-limits is computed. Both axes are then zoomed out by the same
    ratio: the axis with the smaller share keeps its bottom limit and
    gets a higher top limit, the other keeps its top limit and gets a
    lower bottom limit. Neither axis is ever zoomed in.

    Args:
        ax1 (matplotlib.axes.Axes): Axes object from the plot.
        ax2 (matplotlib.axes.Axes): Axes object from the plot.
            Typically a twinx object.
    '''
    def share_above_zero(limits):
        bottom, top = limits
        return top / (top - bottom)

    lims1, lims2 = ax1.get_ylim(), ax2.get_ylim()
    share1, share2 = share_above_zero(lims1), share_above_zero(lims2)

    # "low" is the axis with the smaller share; on a tie the argument
    # order is kept.
    if share1 > share2:
        low = (ax2, lims2, share2)
        high = (ax1, lims1, share1)
    else:
        low = (ax1, lims1, share1)
        high = (ax2, lims2, share2)
    low_ax, (low_bottom, low_top), low_share = low
    high_ax, (high_bottom, high_top), high_share = high

    # Common zoom-out ratio applied to the span of both axes.
    stretch = high_share + 1 - low_share
    low_ax.set_ylim(low_bottom, low_bottom + stretch * (low_top - low_bottom))
    high_ax.set_ylim(high_top - stretch * (high_top - high_bottom), high_top)

def generate_plot(df, figidx, votes=True, numAnimMovies=None):
    '''
    Draw a three-panel figure of rating statistics per release year.

    The panels are stacked without vertical spacing:

    1. mean (left axis) and standard deviation (right twin axis);
    2. minimum, 10th/25th/50th/75th/90th percentiles and maximum;
    3. the 'count' column, labelled either as total votes (together with
       the number of movies released on a twin axis) or as the number of
       movies released, depending on ``votes``.

    Rows whose index is 0 are left out of every panel.

    Args:
        df: Table indexed by release year with the columns 'mean', 'std',
            'min', '10%', '25%', '50%', '75%', '90%', 'max' and 'count'.
        figidx: Figure identifier handed to ``plt.figure``.
        votes (bool): If True, 'count' is plotted as the number of votes
            and ``numAnimMovies`` is drawn on a twin axis. If False,
            'count' is plotted as the number of movies released.
        numAnimMovies: Number of movies per release year, indexed by
            year (presumably a pandas Series; it is accessed through
            ``.loc``, ``.index``, ``.sort_index()`` and ``.values``).
            Required when ``votes`` is True.

    Returns:
        The figure that was drawn on.

    Raises:
        ValueError: If ``votes`` is True but ``numAnimMovies`` is None.
    '''
    if votes and numAnimMovies is None:
        # Fail before anything is drawn instead of part-way through the
        # last panel with an opaque AttributeError on None.
        raise ValueError('numAnimMovies is required when votes=True')
    fig = plt.figure(figidx, figsize=(20, 15), dpi=75)
    # plot the statistics of the data
    ax = fig.add_subplot(311)
    twin1 = ax.twinx()
    # drop the rows indexed by 0 before plotting
    data = df.loc[~df.index.isin([0])]
    xdata = data.index.values
    p1, = ax.plot(xdata, data['mean'], color='tab:blue',
                  label='Average', marker='o')
    p2, = twin1.plot(xdata, data['std'], color='tab:green',
                     label='Standard Deviation', marker='^')
    # colour each y-axis like the line it belongs to
    ax.tick_params(axis='y', colors=p1.get_color())
    twin1.tick_params(axis='y', colors=p2.get_color())
    ax.legend(handles=[p1,p2], ncol=1, loc='lower left')
    # the panels touch (hspace=0), so the top panel carries its x labels on top
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()
    ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))
    ax.grid(axis='x')
    ax.set_ylabel('Ratings out of 5', color=p1.get_color())
    twin1.set_ylabel('Standard Deviation', color=p2.get_color())
    ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))
    twin1.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))
    ax.set_xlabel('Year of release')
    # plot the other statistics for min, 25% , 50%, 75%, and max
    ax = fig.add_subplot(312)
    ax.axhline(0, color='k', linewidth=0.7)
    p1, = ax.plot(xdata, data['min'], label='Minimum', marker='o')
    p2, = ax.plot(xdata, data['25%'], label='25$^{th}$ percentile', marker='^')
    p3, = ax.plot(xdata, data['50%'], label='Median', marker='v')
    p4, = ax.plot(xdata, data['75%'], label='75$^{th}$ percentile', marker='s')
    p5, = ax.plot(xdata, data['max'], label='Maximum', marker='D')
    p6, = ax.plot(xdata, data['10%'], label='10$^{th}$ percentile', marker='+')
    p7, = ax.plot(xdata, data['90%'], label='90$^{th}$ percentile', marker='x')
    ax.legend(handles=[p1, p3, p5, p6, p2, p4, p7], ncol=7)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))
    # middle panel: x tick labels are hidden
    ax.set_xticklabels([])
    ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))
    ax.set_ylabel('Ratings out of 5')
    ax.grid(axis='x')
    ax.set_ylim([-0.25, 5.25])
    ax.yaxis.set_minor_locator(ticker.MultipleLocator(0.5))
    # plot the number of movies released and the number of votes depending on bool value
    ax = fig.add_subplot(313)
    if votes:
        twin1 = ax.twinx()
        p1, = ax.plot(xdata, data['count'], color='tab:red',
                      label='Total Votes', marker='s')
        # same index-0 exclusion as above, ordered by year for the line plot
        ndata = numAnimMovies.loc[~numAnimMovies.index.isin([0])].sort_index()
        p2, = twin1.plot(ndata.index.values, ndata.values, color='tab:cyan',
                         label='Movies released', marker='v')
        ax.tick_params(axis='y', colors=p1.get_color())
        twin1.tick_params(axis='y', colors=p2.get_color())
        ax.legend(handles=[p1,p2], ncol=1, loc='upper left')
        ax.set_ylabel('Number of Votes', color=p1.get_color())
        twin1.set_ylabel('Number of Movies Released', color=p2.get_color())
    else:
        ax.plot(xdata, data['count'], label='Movies released', marker='o')
        ax.set_ylabel('Number of Movies Released')
        ax.legend()
    ax.grid(axis='x')
    ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))
    ax.set_xlabel('Year of release')
    fig.subplots_adjust(hspace=0)
    return fig

Parse the data#

[3]:

# Read every non-"genome" CSV file of each dataset directory into
# dfs[<dataset key>][<file name without '.csv'>].
dirs = dict(large='ml-latest')
# Identifier columns are read as categories / strings rather than numbers.
dtypes = dict(userId='category', movieId='category',
              imdbId=str, tmdbId=str)
dfs = {}
for key, parent in dirs.items():
    tables = dfs[key] = {}
    for fn in os.listdir(parent):
        if fn.startswith('genome') or not fn.endswith('.csv'):
            continue
        sub_key = fn.replace('.csv', '')
        filename = os.path.join(parent, fn)
        tables[sub_key] = pd.read_csv(filename, dtype=dtypes)
all_data = dfs

[4]:

# Load the tab-separated ratings table.
fn = os.path.join('imdb-data', 'title-ratings', 'data.tsv')
df = pd.read_csv(fn, sep='\t', dtype={'tconst': str, 'numVotes': int})
# Drop the first two characters of every identifier so that it can be
# matched against the 'imdbId' column of the links table.
df['tconst'] = [tconst[2:] for tconst in df['tconst']]
# Right join: one row per entry of the links table, with or without ratings.
links = all_data['large']['links']
df = pd.merge(df, links, left_on='tconst', right_on='imdbId', how='right')
all_data['imdb-ratings'] = df

[5]:

# New dict holding the same table objects as all_data['large']; the tables
# themselves are not copied, so the added column is visible through both.
data_df = dict(all_data['large'])
ratings = data_df['ratings']
# 'timestamp' is interpreted as seconds since the epoch.
ratings['date'] = pd.to_datetime(ratings['timestamp'], unit='s')

[6]:

# Release year parsed from each title (0 when the title has no year).
movies = data_df['movies']
movies['releaseYear'] = movies['title'].map(get_release_year)

[7]:

# Attach the movie columns to every rating, then keep only the ratings of
# movies whose genre string mentions 'Animation'.
ratings_df = pd.merge(data_df['ratings'], data_df['movies'], on='movieId')
contains = ratings_df['genres'].str.contains('Animation')
anim_df = ratings_df.loc[contains]

Average ratings for a given time period#

Here, we will see the average ratings of movies for every two months.

[8]:

# Summarise the animated-movie ratings in two-month bins of the 'date'
# column and draw two stacked panels: mean/standard deviation on top, and
# rating volume (ratings, unique movies, ratings per user) below.
grouped = anim_df.groupby(pd.Grouper(freq='2M', key='date'))
df = grouped['rating'].describe()
df['uniqueMovies'] = grouped['title'].nunique()
df['uniqueUsers'] = grouped['userId'].nunique()
df['ratingsPerUser'] = df['count'] / df['uniqueUsers']
fig = plt.figure(1, figsize=(20,10), dpi=75)
# top panel: average (left axis) and standard deviation (right twin axis)
ax = fig.add_subplot(211)
twin1 = ax.twinx()
# x positions as fractional years (year + month/12) taken from each bin label
xdata = [x.year+x.month/12 for x in df.index]
p1, = ax.plot(xdata, df['mean'], color='tab:blue',
              label='Average', marker='o')
p2, = twin1.plot(xdata, df['std'], color='tab:green',
                 label='Standard Deviation', marker='^')
# the panels touch (hspace=0 below), so this panel carries its x labels on top
ax.xaxis.set_label_position('top')
ax.xaxis.tick_top()
# colour each y-axis like the line it belongs to
ax.tick_params(axis='y', colors=p1.get_color())
twin1.tick_params(axis='y', colors=p2.get_color())
ax.legend(handles=[p1, p2], loc='lower center', ncol=2)
# major tick every 2 years, minor tick every half year
ax.xaxis.set_major_locator(ticker.MultipleLocator(2))
ax.xaxis.set_minor_locator(ticker.MultipleLocator(0.5))
ax.grid(axis='x')
ax.set_ylabel('Ratings out of 5', color=p1.get_color())
twin1.set_ylabel('Standard Deviation', color=p2.get_color())
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))
twin1.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))
ax.set_xlabel('Date')
# bottom panel: number of ratings, unique movies rated and ratings per user,
# each on its own y-axis
ax = fig.add_subplot(212)
twin1 = ax.twinx()
twin2 = ax.twinx()
ax.axhline(0, color='k', linewidth=0.7)
p1, = ax.plot(xdata, df['count'], color='tab:red',
              label='Number of ratings', marker='s')
p2, = twin1.plot(xdata, df['uniqueMovies'], color='tab:cyan',
                 marker='D', label='Unique movies rated')
p3, = twin2.plot(xdata, df['ratingsPerUser'], marker='v',
                color='tab:orange', label='Ratings per user')
# place the third y-axis at 1.07 in axes coordinates, i.e. to the right of
# twin1's axis at the panel edge
twin2.spines.right.set_position(("axes", 1.07))
ax.xaxis.set_major_locator(ticker.MultipleLocator(2))
ax.xaxis.set_minor_locator(ticker.MultipleLocator(0.5))
twin1.yaxis.set_minor_locator(ticker.MultipleLocator(100))
ax.legend(handles=[p1, p2, p3], loc='upper center', ncol=2)
ax.tick_params(axis='y', colors=p1.get_color())
twin1.tick_params(axis='y', colors=p2.get_color(), which='both')
twin2.tick_params(axis='y', colors=p3.get_color(), which='both')
ax.grid(axis='x')
ax.set_ylabel('Number of Ratings', color=p1.get_color())
twin1.set_ylabel('Number of unique movies rated', color=p2.get_color())
twin2.set_ylabel('Ratings per user', color=p3.get_color())
# line up the zero of the main axis with each twin axis in turn
# NOTE(review): the second call may change ax's limits again, which could
# undo the alignment with twin2 - confirm on the rendered figure.
align_y_axis(ax, twin2)
align_y_axis(ax, twin1)
ax.set_xlabel('Date')
# no vertical gap between the two panels
fig.subplots_adjust(hspace=0)

../_images/movie-analysis_anim_movies_analysis_11_0.png

[9]:

# Show the first 15 two-month bins.
df.iloc[:15]

[9]:

	count	mean	std	min	25%	50%	75%	max	uniqueMovies	uniqueUsers	ratingsPerUser
date
1996-01-31	4.0	4.250000	0.500000	4.0	4.0	4.0	4.25	5.0	2	3	1.333333
1996-03-31	438.0	4.280822	0.845582	1.0	4.0	4.0	5.00	5.0	16	237	1.848101
1996-05-31	15752.0	3.783329	0.982402	1.0	3.0	4.0	5.00	5.0	22	5254	2.998097
1996-07-31	28920.0	3.677663	0.946787	1.0	3.0	4.0	4.00	5.0	24	8549	3.382852
1996-09-30	21173.0	3.620413	0.935787	1.0	3.0	4.0	4.00	5.0	31	6349	3.334856
1996-11-30	23532.0	3.709969	0.958391	1.0	3.0	4.0	4.00	5.0	38	6831	3.444884
1997-01-31	13769.0	3.744644	0.944654	1.0	3.0	4.0	5.00	5.0	39	4900	2.810000
1997-03-31	8235.0	3.696296	0.976493	1.0	3.0	4.0	4.00	5.0	40	3266	2.521433
1997-05-31	8620.0	3.640951	0.986023	1.0	3.0	4.0	4.00	5.0	40	3294	2.616879
1997-07-31	7128.0	3.633418	0.992685	1.0	3.0	4.0	4.00	5.0	41	2505	2.845509
1997-09-30	828.0	3.650966	1.053657	1.0	3.0	4.0	4.00	5.0	40	198	4.181818
1997-11-30	2317.0	3.589987	1.127932	1.0	3.0	4.0	4.00	5.0	41	511	4.534247
1998-01-31	1758.0	3.543231	1.096436	1.0	3.0	4.0	4.00	5.0	42	425	4.136471
1998-03-31	1243.0	3.494771	1.110162	1.0	3.0	4.0	4.00	5.0	42	319	3.896552
1998-05-31	894.0	3.699105	1.088502	1.0	3.0	4.0	5.00	5.0	42	246	3.634146

Here, we can see that there were not many ratings made for animated movies at the start of 1996, and the average ratings were quite high with a small sample size. After that short period, the average rating dropped significantly and remained fairly constant, not changing greatly even as the number of unique movies rated and the ratings per user increased. However, this shows the behavior of all the movies that were available on or before the date the ratings were made. What about how older animated movies have performed vs. newer ones?

Average ratings of movies released in one calendar year#

Here, I will try to show how older animated films compare to newer animated films. I am interested in this because older movies had to be hand-drawn, whereas newer movies rely much more on 3D animation thanks to the rise of computers and the decreasing cost of 3D animation. I will achieve this by taking all the animated movies that were released in one calendar year and averaging all the ratings made for them. I am defining the calendar year just by the year in which each movie was released. So, if one movie was released in January of 1995 and another in December of 1995, they will both be placed in the group of movies released in 1995.

[10]:

# Number of movies with 'Animation' in their genres per release year,
# largest count first.
movies = data_df['movies']
contains = movies['genres'].str.contains('Animation')
grouped = movies.loc[contains].groupby('releaseYear')
numAnimMovies = grouped['title'].count().sort_values(ascending=False)

[11]:

# Rating statistics (including the 10th and 90th percentiles) over all
# ratings of the movies released in each year.
grouped = anim_df.groupby('releaseYear')
ratingByRelease = grouped['rating'].describe(
    percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])
df = ratingByRelease

[12]:

# Per-release-year statistics, with votes and movies released in the bottom panel.
fig = generate_plot(ratingByRelease, 1, votes=True,
                    numAnimMovies=numAnimMovies)

../_images/movie-analysis_anim_movies_analysis_17_0.png

[13]:

# Release years ordered by their number of ratings, most-rated first.
byCount = ratingByRelease.sort_values(by=['count'], ascending=False)
highestCount = byCount.index.values
byCount.head(9)

[13]:

	count	mean	std	min	10%	25%	50%	75%	90%	max
releaseYear
2001	175892.0	3.788248	1.010258	0.5	2.5	3.0	4.0	4.5	5.0	5.0
1995	131964.0	3.759938	1.025821	0.5	2.5	3.0	4.0	4.5	5.0	5.0
2004	118732.0	3.619854	1.055222	0.5	2.0	3.0	4.0	4.5	5.0	5.0
2009	92743.0	3.702824	1.007579	0.5	2.5	3.0	4.0	4.5	5.0	5.0
2010	89128.0	3.709693	1.014302	0.5	2.5	3.0	4.0	4.5	5.0	5.0
2008	87883.0	3.744797	1.016023	0.5	2.5	3.0	4.0	4.5	5.0	5.0
1999	85982.0	3.697914	1.033350	0.5	2.5	3.0	4.0	4.5	5.0	5.0
1998	79688.0	3.418520	1.023038	0.5	2.0	3.0	3.5	4.0	5.0	5.0
2007	78187.0	3.490158	1.047424	0.5	2.0	3.0	3.5	4.0	5.0	5.0

Here, we show a plot of the average all-time ratings (blue line), the respective standard deviation (green line), the total number of ratings (red line), and the number of animated movies released in that calendar year (cyan line). What I can infer from this data is that animated movies, regardless of when they were released, seem to have pretty consistent ratings oscillating around 3.4. The standard deviation is a bit all over the place for animated movies released prior to 1930. It becomes more consistent after 1930 and seems to revolve around 1.0.

We can also see that the number of animated movies released each calendar year has risen pretty dramatically, from a maximum of 50 between 1930 and 1955 to approximately 170 in 2019. I suspect a lot of this has to do with the increase in computing power available to creators, which can lead to lower production costs, as 3D animated movies became more common at the end of the 20th century than their traditional, hand-drawn counterparts.

Study on individual years#

Now what I want to look at is the data for individual years instead of the entire span of time for which data is available. If we look at the plot that we generated above, we can see that there are a few peaks in the number of ratings made for animated movies that came out in 1988, 1995, 2001, 2004, and 2008 to 2010. What I’m wondering is whether there are a couple of especially popular movies that came out in those years that may be significantly increasing the total number of votes. Along with that, I want to see if those movies are significantly affecting the average rating for that year.

So let’s start with fetching the data that we need.

[14]:

# Release years whose rating counts stand out in the per-year plot.
years = [1988, 1995, 2001, 2004, 2008, 2009, 2010]
data = anim_df[anim_df['releaseYear'].isin(years)].copy()
# One row of rating statistics per movie, with its release year and title.
movieInfo = data[['movieId', 'releaseYear', 'title']].drop_duplicates()
select_data = data.groupby('movieId')['rating'].describe() \
                .merge(movieInfo, on='movieId', how='left')
# Each movie's share (in percent) of all ratings given to movies released in
# the same year. transform('sum') returns every year's total on the rows of
# select_data itself, so the result lines up with the frame by index.
yearTotals = select_data.groupby('releaseYear')['count'].transform('sum')
percentTotal = select_data['count'] / yearTotals * 100
select_data['percentTotal'] = percentTotal

Movies with the highest number of ratings#

[15]:

# The five most-rated movies of every selected release year, listed year by
# year with the most-rated movie first.
disp_cols = ['releaseYear', 'mean', 'std', 'count', 'percentTotal']
byCount = select_data.sort_values(by=['count'], ascending=False)
topPerYear = byCount.groupby('releaseYear').head(5)
ordered = topPerYear.sort_values(by=['releaseYear', 'count'],
                                 ascending=[True, False])
df = ordered.set_index('title')[disp_cols].copy()
df['count'] = df['count'].astype(int)
# Shade the rows by release year.
df.style.background_gradient(axis=0, gmap=df['releaseYear'])

[15]:

	releaseYear	mean	std	count	percentTotal
title
Who Framed Roger Rabbit? (1988)	1988	3.543433	0.949754	26627	39.398961
My Neighbor Totoro (Tonari no Totoro) (1988)	1988	4.163490	0.858451	14010	20.730065
Akira (1988)	1988	3.934376	0.937954	12122	17.936463
Grave of the Fireflies (Hotaru no haka) (1988)	1988	4.101209	0.911399	6946	10.277733
Oliver & Company (1988)	1988	3.316730	0.949151	3443	5.094476
Toy Story (1995)	1995	3.893508	0.929105	76813	58.207541
Pocahontas (1995)	1995	2.978704	1.074605	17562	13.308175
Wallace & Gromit: A Close Shave (1995)	1995	4.096216	0.976068	14587	11.053772
Ghost in the Shell (Kôkaku kidôtai) (1995)	1995	3.990715	0.905179	10986	8.324998
Goofy Movie, A (1995)	1995	3.126086	1.057361	4489	3.401685
Shrek (2001)	2001	3.748595	0.956309	58529	33.275533
Monsters, Inc. (2001)	2001	3.840528	0.889162	48441	27.540195
Spirited Away (Sen to Chihiro no kamikakushi) (2001)	2001	4.226035	0.909657	35375	20.111773
Final Fantasy: The Spirits Within (2001)	2001	3.074195	1.064298	8727	4.961567
Atlantis: The Lost Empire (2001)	2001	3.368157	0.975421	5279	3.001274
Incredibles, The (2004)	2004	3.850139	0.912405	42953	36.176431
Shrek 2 (2004)	2004	3.478163	1.005143	26972	22.716707
Howl's Moving Castle (Hauru no ugoku shiro) (2004)	2004	4.118815	0.879780	16471	13.872419
Team America: World Police (2004)	2004	3.456382	1.079482	8689	7.318162
Polar Express, The (2004)	2004	3.088515	1.087693	5146	4.334131
WALL·E (2008)	2008	4.013953	0.895623	42033	47.828363
Kung Fu Panda (2008)	2008	3.626686	0.983823	17050	19.400794
Ponyo (Gake no ue no Ponyo) (2008)	2008	3.847453	0.872578	5359	6.097880
Bolt (2008)	2008	3.268886	0.963682	5017	5.708726
Madagascar: Escape 2 Africa (2008)	2008	3.187572	1.031003	3452	3.927950
Up (2009)	2009	3.960453	0.886525	38751	41.783207
Fantastic Mr. Fox (2009)	2009	3.894344	0.912491	9990	10.771702
Coraline (2009)	2009	3.749773	0.924659	9933	10.710242
Cloudy with a Chance of Meatballs (2009)	2009	3.339916	0.985740	5494	5.923897
9 (2009)	2009	3.457567	0.942510	4513	4.866135
Toy Story 3 (2010)	2010	3.832119	0.994369	21131	23.708599
How to Train Your Dragon (2010)	2010	3.905903	0.926267	20872	23.418006
Despicable Me (2010)	2010	3.659673	0.993618	14561	16.337178
Tangled (2010)	2010	3.727357	0.963097	11869	13.316803
Megamind (2010)	2010	3.563637	0.982083	8352	9.370793

Movies with the lowest number of ratings#

[16]:

# The five least-rated movies of every selected release year, listed year by
# year in descending order of rating count.
disp_cols = ['releaseYear', 'mean', 'std', 'count', 'percentTotal']
byCount = select_data.sort_values(by=['count'], ascending=False)
bottomPerYear = byCount.groupby('releaseYear').tail(5)
ordered = bottomPerYear.sort_values(by=['releaseYear', 'count'],
                                    ascending=[True, False])
df = ordered.set_index('title')[disp_cols].copy()
df['count'] = df['count'].astype(int)
# Shade the rows by release year.
df.style.background_gradient(axis=0, gmap=df['releaseYear'])

[16]:

	releaseYear	mean	std	count	percentTotal
title
Urusei Yatsura: The Final Chapter (1988)	1988	2.500000	nan	1	0.001480
Tokyo The Last Megalopolis (1988)	1988	3.000000	nan	1	0.001480
Winter (1988)	1988	2.500000	nan	1	0.001480
Self Portrait (1988)	1988	3.500000	nan	1	0.001480
Snoopy: The Musical (1988)	1988	2.000000	nan	1	0.001480
Pib and Pog (1995)	1995	4.000000	0.000000	2	0.001516
Achilles (1995)	1995	2.500000	1.414214	2	0.001516
Landlock (1995)	1995	3.000000	nan	1	0.000758
Legend of Crystania: The Motion Picture (1995)	1995	3.000000	nan	1	0.000758
Elf Princess Ren (1995)	1995	2.500000	nan	1	0.000758
Helicopter (2001)	2001	3.000000	nan	1	0.000569
A Christmas Adventure ...From a Book Called Wisely's Tales (2001)	2001	3.500000	nan	1	0.000569
Mister Blot's Triumph (2001)	2001	0.500000	nan	1	0.000569
Attraction (2001)	2001	3.500000	nan	1	0.000569
ReBoot - My Two Bobs (2001)	2001	0.500000	nan	1	0.000569
L'île de Black Mór (2004)	2004	3.000000	nan	1	0.000842
Grrl Power! (2004)	2004	2.000000	nan	1	0.000842
King of Fools (2004)	2004	4.000000	nan	1	0.000842
VeggieTales: Sumo of the Opera (2004)	2004	4.000000	nan	1	0.000842
Flatlife (2004)	2004	3.500000	nan	1	0.000842
Stand Up (2008)	2008	1.000000	nan	1	0.001138
That Lazy Boy (2008)	2008	4.000000	nan	1	0.001138
Judas & Jesus (2008)	2008	4.000000	nan	1	0.001138
Moomin and Midsummer Madness (2008)	2008	5.000000	nan	1	0.001138
Deconstruction Workers (2008)	2008	4.000000	nan	1	0.001138
Alice's Birthday (2009)	2009	3.000000	nan	1	0.001078
Wide Open Spaces (2009)	2009	1.500000	nan	1	0.001078
A Family Portrait (2009)	2009	3.500000	nan	1	0.001078
Gift of the Hoopoe (2009)	2009	1.500000	nan	1	0.001078
Heavenly Appeals (2009)	2009	2.500000	nan	1	0.001078
Gravity was everywhere back then (2010)	2010	4.000000	nan	1	0.001122
Toonpur Ka Superrhero (2010)	2010	3.000000	nan	1	0.001122
Chainsaw Maid 2 (2010)	2010	0.500000	nan	1	0.001122
Rabid Rider (2010)	2010	3.500000	nan	1	0.001122
Bob the Builder: Legend of the Golden Hammer (2010)	2010	3.000000	nan	1	0.001122

Clearly, we can see that there are certain years in which one movie in particular was extremely popular and the votes were heavily weighted toward that one movie. A prime example of this is Toy Story (1995), which received over 58% of the total ratings cast for animated movies released in that year. So does this mean that we should be weighting the average ratings for movies of a specific year differently?

What I will now try to do is remove the dependence on the popularity (number of ratings) of a given movie and get an average over the movies of a specific year regardless of the number of ratings made. I believe this will be a more accurate representation of how animated films have performed during a specific calendar year, as it weights the most popular and the least popular movies evenly.

Analysis on the average of the average#

Here, what I will do is get an average of all the available ratings for each movie separately and then group the movies by the release year and get the average of the average ratings for movies released each calendar year.

[17]:

# Step 1: rating statistics of every movie on its own.
tmp = anim_df.copy()
cols = ['movieId', 'releaseYear', 'title']
perMovie = tmp.groupby('movieId')['rating'].describe()
# Step 2: attach each movie's release year and title.
movieInfo = anim_df[cols].drop_duplicates()
perMovie = perMovie.merge(movieInfo, on='movieId', how='left')
# Step 3: statistics of the per-movie means within each release year, so
# that every movie counts once however many ratings it received.
avgRatings = perMovie.groupby('releaseYear')['mean'].describe(
    percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])

[18]:

# Same figure layout for the per-movie averages; the bottom panel shows the
# 'count' column as the number of movies released.
fig = generate_plot(avgRatings, 2, votes=False)

../_images/movie-analysis_anim_movies_analysis_29_0.png

What we can see, comparing to the previous figure, is that overall the average goes down slightly. This could come from the median dropping from 4 to approximately 3 and the minimum value decreasing steadily over the years. However, the 25th and 75th percentiles stay fairly close to the median, lying no more than about 0.5 below and above it, respectively. Also, the 10th and 90th percentiles are a bit further apart from the minimum and maximum, respectively, which seems to point to a more evenly distributed data set.

Overall, I believe that this gives a much better description of how animated films have performed as we have removed the popularity bias from the data. However, we are also giving more power to those who rated unpopular movies, so the opinions of those people seem to have more power which opens up the trends to their bias.

Conclusion#

My final conclusion, based on this data, is that animated movies actually seem to have performed fairly well throughout the years. It would be interesting to see if there is a way to group the movies by the studio that produced them, such as Pixar, Dreamworks, Disney (before acquiring Pixar), and Studio Ghibli, and see how they have performed individually.

As with any other data set, there is no perfect method that one can use to interpret results and overall trends, but as long as we do our due diligence, try to take into account all the factors that can affect the data, and test the different methods, we should be able to draw reasonable conclusions from the data. Thank you for coming along with me on this magical journey, and I hope that you enjoyed this small analysis that I have made of the overall progress of animated movies throughout the years.

Analysis of ratings on animated movies throughout the years

Contents