import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


df = pd.read_csv("../DSproject/imdb_top_1000.csv.xls")
df.head(3)


#Find columns with missing data
df.info()
sns.heatmap(df.isna(), cbar=False, cmap="YlGnBu_r")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB

<Axes: >


df.loc[df.isna().any(axis=1), ['Certificate','Meta_score','Gross']]


#Drop rows without "Gross" value
#"Gross" is dependent variable, so I don't simply fill it
df = df[df['Gross'].notna()]
df.shape

(831, 16)


#Fill NaN with mode and median
categorical_columns=df.select_dtypes(include=['object']).columns.tolist()
numerical_columns=df.select_dtypes(include=['int64','float64']).columns.tolist()
print("Categorical:",categorical_columns)
print("Numerical:",numerical_columns)

for column in df:
    if df[column].isnull().any():
        if(column in categorical_columns):
            df[column]=df[column].fillna(df[column].mode()[0])
        else:
            df[column]=df[column].fillna(df[column].median())

Categorical: ['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate', 'Runtime', 'Genre', 'Overview', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'Gross']
Numerical: ['IMDB_Rating', 'Meta_score', 'No_of_Votes']


#Review data after cleaning
df.info()
sns.heatmap(df.isna(), cbar=False, cmap="YlGnBu_r")

<class 'pandas.core.frame.DataFrame'>
Index: 831 entries, 0 to 997
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    831 non-null    object 
 1   Series_Title   831 non-null    object 
 2   Released_Year  831 non-null    object 
 3   Certificate    831 non-null    object 
 4   Runtime        831 non-null    object 
 5   Genre          831 non-null    object 
 6   IMDB_Rating    831 non-null    float64
 7   Overview       831 non-null    object 
 8   Meta_score     831 non-null    float64
 9   Director       831 non-null    object 
 10  Star1          831 non-null    object 
 11  Star2          831 non-null    object 
 12  Star3          831 non-null    object 
 13  Star4          831 non-null    object 
 14  No_of_Votes    831 non-null    int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 110.4+ KB

<Axes: >


df['Principal_Genre'] = df['Genre'].apply(lambda x: x.split(',')[0])
df[['Genre','Principal_Genre']].loc[:5,:]


#Check object variables with <20 of unique categories
df1 = pd.DataFrame(df.select_dtypes(include=['object']).nunique()).transpose()
df1.loc[:,df1.loc[0,:]<20]


#Convert "Certificate" from object to category
df['Certificate'] = df['Certificate'].astype('category')

#Convert "Principal_Genre" from object to category
df['Principal_Genre'] = df['Principal_Genre'].astype('category')

#Drop rows with non-digit in "Released_Year"
for ind, value in df['Released_Year'].items():
    if value.isdigit() != True:
        print(df.loc[ind,['Released_Year']])
        df.drop(ind, inplace=True)
        print("\nIndex",ind,"is dropped because it has non-digit.\n")
        
#Convert "Released_Year" from object to int
df['Released_Year'] = df['Released_Year'].astype('int')

#Convert 'Runtime' from object to int
df['Runtime'] = df['Runtime'].apply(lambda x: x.split(" ")[0])
df.rename({'Runtime':'Runtime (min)'}, axis=1, inplace=True)
df['Runtime (min)'] = df['Runtime (min)'].astype('int')

#Convert 'Gross' from object to int
df['Gross'] = df['Gross'].loc[df['Gross'].notna()].apply(lambda x: x.replace(',',''))
df['Gross'] = df['Gross'].astype('int')
df['Gross'] = df['Gross']/1e6
df.rename({'Gross':'Gross (mil $)'}, axis=1, inplace=True)

df.info()

Released_Year    PG
Name: 966, dtype: object

Index 966 is dropped because it has non-digit.

<class 'pandas.core.frame.DataFrame'>
Index: 830 entries, 0 to 997
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Poster_Link      830 non-null    object  
 1   Series_Title     830 non-null    object  
 2   Released_Year    830 non-null    int64   
 3   Certificate      830 non-null    category
 4   Runtime (min)    830 non-null    int64   
 5   Genre            830 non-null    object  
 6   IMDB_Rating      830 non-null    float64 
 7   Overview         830 non-null    object  
 8   Meta_score       830 non-null    float64 
 9   Director         830 non-null    object  
 10  Star1            830 non-null    object  
 11  Star2            830 non-null    object  
 12  Star3            830 non-null    object  
 13  Star4            830 non-null    object  
 14  No_of_Votes      830 non-null    int64   
 15  Gross (mil $)    830 non-null    float64 
 16  Principal_Genre  830 non-null    category
dtypes: category(2), float64(3), int64(3), object(9)
memory usage: 106.7+ KB


print("Before drop: ", df.shape)
df = df.drop(['Poster_Link','Genre','Overview'], axis=1)
print("After drop: ", df.shape)

Before drop:  (830, 17)
After drop:  (830, 14)


f, [ax0,ax1] = plt.subplots(1,2,figsize=(20,6))
sns.set_theme()
sns.histplot(df, x="IMDB_Rating", kde=True, ax=ax0)
ax0.set_title("IMDB_Rating", fontsize=14, fontweight='bold')

sns.histplot(df, x="Gross (mil $)", kde=True, ax=ax1, log_scale=True)
ax1.set_title("Gross (mil $)", fontsize=14, fontweight='bold')

#df.loc[df["IMDB_Rating"]>8.9,"Principal_Genre"]   #Top 5 IMBD rating movies are drama, crime and action genres.
#df.loc[df["Gross (mil $)"]>600,"Principal_Genre"] #Top 5 gross earning movies are action genre.

Text(0.5, 1.0, 'Gross (mil $)')


max(df['Gross (mil $)'])

936.662225


df.select_dtypes(include=['int64','float64']).drop(["IMDB_Rating","Gross (mil $)"],axis=1).loc[:5,:]


def scatterplots(df, num_feature):
    f, [ax0,ax1] = plt.subplots(1,2,figsize=(20,6))
    sns.scatterplot(data=df, x=num_feature, y="IMDB_Rating", ax=ax0, hue="Principal_Genre", legend=False)
    sns.regplot(data=df, x=num_feature, y="IMDB_Rating", ax=ax0, scatter=False)                
    ax0.set_title("IMDB_Rating vs {}".format(num_feature), fontsize=14, fontweight='bold')

    sns.scatterplot(data=df, x=num_feature, y="Gross (mil $)", ax=ax1, hue="Principal_Genre")
    sns.regplot(data=df, x=num_feature, y="Gross (mil $)", ax=ax1, scatter=False) 
    ax1.set_title("Gross (mil $) vs {}".format(num_feature),fontsize=14, fontweight='bold')
    ax1.legend(loc='best', ncol=2, handlelength=0.5)
    return


scatterplots(df, "Released_Year")


scatterplots(df, "Runtime (min)")


scatterplots(df, "Meta_score")


scatterplots(df, "No_of_Votes")


df.select_dtypes(include=['category','object']).drop(columns="Series_Title").loc[:5,:]


def boxplots_sorted_by_yvals(df, catg_feature, sort_by_target1, sort_by_target2):
    
    df_by_catg   = df.groupby([catg_feature])
    sortedlist1 = df_by_catg[sort_by_target1].median().sort_values().keys().tolist()
    sortedlist2 = df_by_catg[sort_by_target2].median().sort_values().keys().tolist()
    
    f, [ax0,ax1] = plt.subplots(1,2,figsize=(16,6))
    sns.boxplot(data=df, x=catg_feature, y="IMDB_Rating", ax=ax0, order=sortedlist1[-15:])
    ax0.set_xticklabels(ax0.get_xticklabels(),rotation = 45)
    ax0.set_title("IMDB_Rating vs {}".format(catg_feature), fontsize=14, fontweight='bold')

    sns.boxplot(data=df, x=catg_feature, y="Gross (mil $)", ax=ax1, order=sortedlist2[-15:])
    ax1.set_xticklabels(ax1.get_xticklabels(),rotation = 45)
    ax1.set_title("Gross (mil $) vs {}".format(catg_feature), fontsize=14, fontweight='bold')
    return [sortedlist1, sortedlist2]


[Cert_list1, Cert_list2] = boxplots_sorted_by_yvals(df, "Certificate", "IMDB_Rating", "Gross (mil $)")
[PGenre_list1, PGenre_list2] = boxplots_sorted_by_yvals(df, "Principal_Genre", "IMDB_Rating", "Gross (mil $)")


[Director_list1, Director_list2] = boxplots_sorted_by_yvals(df, "Director", "IMDB_Rating", "Gross (mil $)")
[Star1_list1, Star1_list2] = boxplots_sorted_by_yvals(df, "Star1", "IMDB_Rating", "Gross (mil $)")
[Star2_list1, Star2_list2] = boxplots_sorted_by_yvals(df, "Star2", "IMDB_Rating", "Gross (mil $)")
[Star3_list1, Star3_list2] = boxplots_sorted_by_yvals(df, "Star3", "IMDB_Rating", "Gross (mil $)")
[Star4_list1, Star4_list2] = boxplots_sorted_by_yvals(df, "Star4", "IMDB_Rating", "Gross (mil $)")


def label_catg(num, sorted_list, catg_feature, new_feature):
    for item in sorted_list:
        df.loc[(df[catg_feature] == item ), new_feature] = num
        num=num+1
    return

def plot_corr_matrix(df, nr_c, targ) :
    
    corr = df.corr()
    corr_abs = corr.abs()
    cols = corr_abs.nlargest(nr_c, targ)[targ].index
    cm = np.corrcoef(df[cols].values.T)
    mask = np.triu(np.ones_like(cm, dtype=bool))

    plt.figure(figsize=(nr_c/2, nr_c/2))
    sns.set(font_scale=1)
    sns.heatmap(cm, mask=mask, linewidths=2, annot=True, square=True, 
                fmt='.2f', annot_kws={'size': 10}, 
                yticklabels=cols.values, xticklabels=cols.values
               )
    plt.show()
    return cols


#Label categorical feature with numbers        
label_catg(1, Cert_list1, "Certificate", "Cert_num")
label_catg(1, PGenre_list1, "Principal_Genre", "PGenre_num")
label_catg(1, Director_list1, "Director", "Director_num")
label_catg(1, Star1_list1, "Star1", "Star1_num")
label_catg(1, Star2_list1, "Star2", "Star2_num")
label_catg(1, Star3_list1, "Star3", "Star3_num")
label_catg(1, Star4_list1, "Star4", "Star4_num")

#Drop non-numerical features
df_copy1 = df.drop(columns=['Series_Title','Certificate','Director','Star1','Star2','Star3','Star4','Principal_Genre'])

#Correlation Matrix
nr_feats=len(df_copy1.columns)
corr_list1 = plot_corr_matrix(df_copy1, nr_feats, "IMDB_Rating")


#Label categorical feature with numbers
label_catg(1, Cert_list2, "Certificate", "Cert_num")
label_catg(1, PGenre_list2, "Principal_Genre", "PGenre_num")
label_catg(1, Director_list2, "Director", "Director_num")
label_catg(1, Star1_list2, "Star1", "Star1_num")
label_catg(1, Star2_list2, "Star2", "Star2_num")
label_catg(1, Star3_list2, "Star3", "Star3_num")
label_catg(1, Star4_list2, "Star4", "Star4_num")

#Drop non-numerical features
df_copy2 = df.drop(columns=['Series_Title','Certificate','Director','Star1','Star2','Star3','Star4','Principal_Genre'])

#Correlation Matrix
nr_feats=len(df_copy2.columns)
corr_list2 = plot_corr_matrix(df_copy2, nr_feats, "Gross (mil $)")


from sklearn.linear_model import LinearRegression
model = LinearRegression()

columns = ['Star4_num','Star3_num','Star2_num','Star1_num','Director_num','No_of_Votes','Meta_score','Runtime (min)','Cert_num','Released_Year','PGenre_num']

X = df[columns]
y= df['IMDB_Rating']
#y= df['Gross (mil $)']

model.fit(X,y)
model.score(X,y) #For better accuracy prediction, more datasets shall be included in training model.

0.59930068942258


pd.DataFrame(zip(corr_list1, corr_list2),columns=[corr_list1[0], corr_list2[0]]).drop([0])


copy = df.set_index('Series_Title')
pd.DataFrame(copy.groupby("Principal_Genre")['Star4_num'].nlargest(5))
#pd.DataFrame(copy.groupby("Principal_Genre")['Director_num'].nlargest(5))
#pd.DataFrame(copy.groupby("Principal_Genre")['No_of_Votes'].nlargest(5))

#copy[copy['Star4_num']==781] #Show a movie list of the top star
#copy[copy['Director_num']==472] #Show a movie list of the top director

	Poster_Link	Series_Title	Released_Year	Certificate	Runtime	Genre	IMDB_Rating	Overview	Meta_score	Director	Star1	Star2	Star3	Star4	No_of_Votes	Gross
0	https://m.media-amazon.com/images/M/MV5BMDFkYT...	The Shawshank Redemption	1994	A	142 min	Drama	9.3	Two imprisoned men bond over a number of years...	80.0	Frank Darabont	Tim Robbins	Morgan Freeman	Bob Gunton	William Sadler	2343110	28,341,469
1	https://m.media-amazon.com/images/M/MV5BM2MyNj...	The Godfather	1972	A	175 min	Crime, Drama	9.2	An organized crime dynasty's aging patriarch t...	100.0	Francis Ford Coppola	Marlon Brando	Al Pacino	James Caan	Diane Keaton	1620367	134,966,411
2	https://m.media-amazon.com/images/M/MV5BMTMxNT...	The Dark Knight	2008	UA	152 min	Action, Crime, Drama	9.0	When the menace known as the Joker wreaks havo...	84.0	Christopher Nolan	Christian Bale	Heath Ledger	Aaron Eckhart	Michael Caine	2303232	534,858,444

	Genre	Principal_Genre
0	Drama	Drama
1	Crime, Drama	Crime
2	Action, Crime, Drama	Action
3	Crime, Drama	Crime
4	Crime, Drama	Crime
5	Action, Adventure, Drama	Action

	Released_Year	Runtime (min)	Meta_score	No_of_Votes
0	1994	142	80.0	2343110
1	1972	175	100.0	1620367
2	2008	152	84.0	2303232
3	1974	202	90.0	1129952
4	1957	96	96.0	689845
5	2003	201	94.0	1642758

	Certificate	Director	Star1	Star2	Star3	Star4	Principal_Genre
0	A	Frank Darabont	Tim Robbins	Morgan Freeman	Bob Gunton	William Sadler	Drama
1	A	Francis Ford Coppola	Marlon Brando	Al Pacino	James Caan	Diane Keaton	Crime
2	UA	Christopher Nolan	Christian Bale	Heath Ledger	Aaron Eckhart	Michael Caine	Action
3	A	Francis Ford Coppola	Al Pacino	Robert De Niro	Robert Duvall	Diane Keaton	Crime
4	U	Sidney Lumet	Henry Fonda	Lee J. Cobb	Martin Balsam	John Fiedler	Crime
5	U	Peter Jackson	Elijah Wood	Viggo Mortensen	Ian McKellen	Orlando Bloom	Action

		Star4_num
Principal_Genre	Series_Title
Action	Avatar	781.0
	Avengers: Endgame	780.0
	Avengers: Infinity War	780.0
	Thor: Ragnarok	780.0
	The Avengers	778.0
Adventure	Harry Potter and the Deathly Hallows: Part 2	764.0
	Harry Potter and the Sorcerer's Stone	755.0
	The Hobbit: An Unexpected Journey	751.0
	Harry Potter and the Goblet of Fire	746.0
	Jaws	739.0
Animation	Incredibles 2	777.0
	Toy Story 4	772.0
	The Lion King	771.0
	Toy Story 3	770.0
	Finding Nemo	768.0
Biography	The Blind Side	735.0
	Bohemian Rhapsody	714.0
	Hidden Figures	688.0
	Catch Me If You Can	684.0
	The Pursuit of Happyness	683.0
Comedy	Home Alone	744.0
	The Hangover	743.0
	Knives Out	685.0
	The Sting	678.0
	La La Land	674.0
Crime	Joker	762.0
	Ocean's Eleven	694.0
	The Green Mile	665.0
	The Departed	663.0
	The Silence of the Lambs	659.0
Drama	Titanic	779.0
	Forrest Gump	759.0
	The Sixth Sense	749.0
	Gravity	742.0
	Saving Private Ryan	715.0
Family	E.T. the Extra-Terrestrial	773.0
Family	Willy Wonka & the Chocolate Factory	210.0
Film-Noir	The Maltese Falcon	168.0
Film-Noir	The Third Man	77.0
Horror	The Exorcist	726.0
	Get Out	631.0
	The Others	612.0
	Alien	594.0
	Saw	533.0
Mystery	Shutter Island	655.0
	Twelve Monkeys	540.0
	Rear Window	455.0
	Memento	412.0
	Dark City	346.0
Thriller	Wait Until Dark	366.0
Western	The Outlaw Josey Wales	465.0
	Per qualche dollaro in più	350.0
	Il buono, il brutto, il cattivo	265.0
	Once Upon a Time in the West	247.0

EDA on IMDb Movies Dataset¶

IMDb Movies Dataset¶

Data Source:¶

Data Content:¶

Exploration Data Analysis (EDA)¶

1. Obtain data from data source¶

Obtain data¶

Overview data¶

2. Clean data to machine readable format¶

Handling missing data¶

Generate new column¶

Fix data types¶

Drop unuseful columns¶

3. Explore data patterns using seaborn and matpoltlib¶

Dependent variables¶

Remark:¶

Relation of independent variables (numerical) to dependent¶

Remark:¶

Relation of independent variables (categorical) to dependent¶

Remark:¶

Remark:¶

Data wrangling for correlation matrix¶

Remark:¶

Data wrangling for correlation matrix¶

Remark:¶

4. Train model to forecast IMDB_Rating and Gross¶

Remark:¶

5. Interpret data patterns to solve Task 1 and Task 2¶

Task 1 : What type of movies are getting good rating and gross amount?¶

Remark:¶

Task 2 : Mention the top 5 movies names according to different genres.¶

	Certificate	Meta_score	Gross
18	PG-13	90.0	NaN
20	U	NaN	NaN
30	NaN	85.0	NaN
32	PG	89.0	NaN
46	U	94.0	NaN
...	...	...	...
993	A	82.0	NaN
995	A	76.0	NaN
996	G	84.0	NaN
998	NaN	78.0	NaN
999	NaN	93.0	NaN

	IMDB_Rating	Gross (mil $)
1	Star4_num	Star4_num
2	Star3_num	Star3_num
3	Star2_num	Star2_num
4	Star1_num	Director_num
5	Director_num	Star1_num
6	No_of_Votes	No_of_Votes
7	Meta_score	PGenre_num
8	Runtime (min)	Cert_num
9	Cert_num	Released_Year
10	Released_Year	Runtime (min)
11	Gross (mil $)	IMDB_Rating
12	PGenre_num	Meta_score