import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Kaggle input directory that holds the MovieLens 20M CSV files.
base = '/kaggle/input/movielens-20m-dataset/'

# Load datasets: movies, ratings and tags, read in that order.
df_movie, df_rating, df_tags = (
    pd.read_csv(base + file_name)
    for file_name in ('movie.csv', 'rating.csv', 'tag.csv')
)

df_movie.head()

df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB

df_movie.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

df_movie.duplicated().sum()

0

df_rating.head()

print(df_rating.shape)

(20000263, 4)

df_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB

df_rating.duplicated().sum()

0

df_rating.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

df_tags.head()

df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465564 entries, 0 to 465563
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userId     465564 non-null  int64 
 1   movieId    465564 non-null  int64 
 2   tag        465548 non-null  object
 3   timestamp  465564 non-null  object
dtypes: int64(2), object(2)
memory usage: 14.2+ MB

df_tags.isna().sum()

userId        0
movieId       0
tag          16
timestamp     0
dtype: int64

df_tags.duplicated().sum()

0

# Number of distinct movie ids in the movie table.
jumlah_film = df_movie['movieId'].nunique(dropna=False)
print(f"Jumlah total film: {jumlah_film}")

Jumlah total film: 27278

# Extract the release year from the title column.
# A title may contain more than one parenthesised four-digit number (for
# example an alternative title followed by the year). The sample titles shown
# in this notebook put the release year last, e.g. "Toy Story (1995)", so take
# the LAST such group rather than the first one.
year_matches = df_movie['title'].str.findall(r'\((\d{4})\)')
# `.str[-1]` yields NaN for titles with no match; those get the sentinel 0 so
# the column can be stored as a plain integer.
df_movie['year'] = year_matches.str[-1].fillna(0).astype(int)

# Show the minimum and maximum year; the minimum ignores the 0 sentinel.
year_min = df_movie.loc[df_movie['year'] > 0, 'year'].min()
year_max = df_movie['year'].max()

print("Tahun Min:", year_min)
print("Tahun Max:", year_max)

Tahun Min: 1891
Tahun Max: 2015

# Number of distinct users that appear in the rating table.
jumlah_user = df_rating['userId'].nunique(dropna=False)
print(f"Jumlah total user: {jumlah_user}")

Jumlah total user: 138493

# Plot distribution of movie genres.
# Each movie lists its genres separated by '|'; split them into columns,
# stack them into one long series and count the occurrences of each genre.
genre_table = df_movie['genres'].str.split('|', expand=True)
genre_counts = genre_table.stack().value_counts()

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=genre_counts.index, y=genre_counts.values, ax=ax)
# Rotate the genre names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Distribution of Movie Genres')
ax.set_xlabel('Genres')
ax.set_ylabel('Count')
plt.show()

# Parse the timestamp strings into datetime values.
df_rating['timestamp'] = pd.to_datetime(df_rating['timestamp'])

# Turn +/- infinity into NaN in place before plotting.
infinities = [float('inf'), float('-inf')]
df_rating.replace(infinities, float('nan'), inplace=True)

# Visualise the rating distribution as a five-bin histogram.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df_rating['rating'], bins=5, kde=False, ax=ax)
ax.set_title('Distribusi Rating')
ax.set_xlabel('Rating')
ax.set_ylabel('Jumlah')
plt.show()

/opt/conda/lib/python3.10/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):

# Bucket every rating by calendar month and count the ratings per bucket.
df_rating['year_month'] = df_rating['timestamp'].dt.to_period('M')
rating_trend = df_rating.groupby('year_month').size()

# Line chart of the monthly rating volume.
fig, ax = plt.subplots(figsize=(10, 6))
rating_trend.plot(marker='o', ax=ax)
ax.set_title('Tren Jumlah Rating Berdasarkan Waktu')
ax.set_xlabel('Waktu (Year-Month)')
ax.set_ylabel('Jumlah Rating')
plt.show()

# Fill missing values first so the string join and the merge below do not
# run into NaN.
df_movie['genres'] = df_movie['genres'].fillna('Unknown')
df_tags['tag'] = df_tags['tag'].fillna('')

# Collapse all tags of one movie into a single space-separated string.
tags_per_movie = df_tags.groupby('movieId')['tag'].agg(' '.join).reset_index()

# Attach the tag string to every movie; movies without any tag get ''.
con_bf = df_movie.merge(tags_per_movie, on='movieId', how='left')
con_bf['tag'] = con_bf['tag'].fillna('')

con_bf.head()

# Concatenate genres and tags, separated by one space, into a single column.
con_bf['combined_features'] = con_bf['genres'].str.cat(con_bf['tag'], sep=' ')

con_bf[['title', 'combined_features']].head()

# Join every rating with its movie row, then drop the timestamp column,
# which is not needed afterwards.
coll_bf = df_rating.merge(df_movie, on='movieId').drop(columns='timestamp')

coll_bf.head()

import string
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK corpora used below.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Initialise the stemmer, the lemmatizer and the English stopword set.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Contraction rewrites, applied in order. The irregular forms come first so the
# generic "n't" rule does not turn them into "ca not" / "wo not".
_CONTRACTIONS = (
    ("can't", "can not"),
    ("won't", "will not"),
    ("n't", " not"),
    ("'re", " are"),
)


def normalize_text(text):
    """Normalise a genre/tag string into a cleaned, space-separated token string.

    Steps, in order: replace the ``|`` separator with a space and lowercase,
    expand common English contractions, delete punctuation, drop English
    stopwords, lemmatize the remaining words, and drop any lemma that is
    itself a stopword.

    Relies on the module-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        text: Raw feature string, e.g. ``"Comedy|Romance old people"``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # Replace the pipe separator with a space and lowercase everything.
    text = text.replace('|', ' ').lower()

    # Expand contractions while the apostrophe is still present. Doing this
    # after punctuation removal can never match, because "don't" has already
    # become "dont" by then.
    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)

    # Delete punctuation (characters are removed, not replaced by spaces).
    text = text.translate(_PUNCTUATION_TABLE)

    # Filter stopwords BEFORE lemmatizing: the lemmatizer can shorten a
    # stopword into a token the stopword list no longer recognises, which
    # would then leak into the features. str.split() also collapses any run
    # of whitespace.
    lemmas = (
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

    # Also drop words whose lemma is a stopword, as the previous
    # lemmatize-then-filter order did.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

# Clean every combined genre/tag string with normalize_text.
raw_features = con_bf['combined_features']
con_bf['combined_clean'] = raw_features.map(normalize_text)

con_bf.head()

con_bf.isnull().sum()

movieId              0
title                0
genres               0
tag                  0
combined_features    0
combined_clean       0
dtype: int64

coll_bf.isnull().sum()

userId     0
movieId    0
rating     0
title      0
genres     0
dtype: int64

con_bf.duplicated().sum()

0

coll_bf.duplicated().sum()

0

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn the cleaned text of every movie into a TF-IDF vector.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(con_bf['combined_clean'])

# Pairwise cosine similarity between all movies. With a single argument the
# matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

# Movie recommendation: ask the content-based model for the titles most
# similar to `film`.
# NOTE(review): get_recommendations is not defined in the visible source;
# it presumably lives in a notebook cell missing from this export.
film = 'The Dark Knight'
num_recommendations = 10

recommendations = get_recommendations(film, num_recommendations)
# Only print and display a result when the helper returned one.
if recommendations is not None:
    print(f'{num_recommendations} film rekomendasi yang mirip dengan {film}: \n')
    display(recommendations[['title', 'combined_clean']])

Precision: 1.00

10 film rekomendasi yang mirip dengan The Dark Knight:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import numpy as np

# Work on the first 1,500,000 rating rows only.
colbf = coll_bf.iloc[:1500000]

# Build the user-item pivot matrix: one row per user, one column per movie,
# with 0 wherever a user has not rated a movie.
rating_pivot = colbf.pivot(index='userId', columns='movieId', values='rating')
user_movie_matrix = rating_pivot.fillna(0)

# Decompose the matrix with TruncatedSVD into 12 components.
svd = TruncatedSVD(n_components=12, random_state=42)
matrix_svd = svd.fit_transform(user_movie_matrix)

RMSE: 0.2879955912967949

# Show the recommendation result for one user from the SVD-based model.
# NOTE(review): recommend_movies_svd is not defined in the visible source;
# it presumably lives in a notebook cell missing from this export.
user_id = 379
num_recommendations = 10
recommended_movies_svd = recommend_movies_svd(user_id, num_recommendations)
print(f"Rekomendasi untuk pengguna dengan id {user_id}:")
# Bare expression: echoed as the cell output when run in a notebook.
recommended_movies_svd

Rekomendasi untuk pengguna dengan id 379:

	movieId	title	genres
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
4	5	Father of the Bride Part II (1995)	Comedy

	userId	movieId	rating	timestamp
0	1	2	3.5	2005-04-02 23:53:47
1	1	29	3.5	2005-04-02 23:31:16
2	1	32	3.5	2005-04-02 23:33:39
3	1	47	3.5	2005-04-02 23:32:07
4	1	50	3.5	2005-04-02 23:29:40

	userId	movieId	tag	timestamp
0	18	4141	Mark Waters	2009-04-24 18:19:40
1	65	208	dark hero	2013-05-10 01:41:18
2	65	353	dark hero	2013-05-10 01:41:19
3	65	521	noir thriller	2013-05-10 01:39:43
4	65	592	dark hero	2013-05-10 01:41:18

	movieId	title	genres	tag
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy	Watched computer animation Disney animated fea...
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy	time travel adapted from:book board game child...
2	3	Grumpier Old Men (1995)	Comedy\|Romance	old people that is actually funny sequel fever...
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance	chick flick revenge characters chick flick cha...
4	5	Father of the Bride Part II (1995)	Comedy	Diane Keaton family sequel Steve Martin weddin...

	title	combined_features
0	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy Wa...
1	Jumanji (1995)	Adventure\|Children\|Fantasy time travel adapted...
2	Grumpier Old Men (1995)	Comedy\|Romance old people that is actually fun...
3	Waiting to Exhale (1995)	Comedy\|Drama\|Romance chick flick revenge chara...
4	Father of the Bride Part II (1995)	Comedy Diane Keaton family sequel Steve Martin...

Recommendation System Movie Lens

Import Library

Setup Base

Load Dataset

Exploratory Data Analysis

Total film yang diproduksi

Tahun minimal dan maksimal produksi film

Jumlah User (Konsumen)

Distribusi Genres

Distribusi Rating

Tren Rating Berdasarkan Waktu

Data Preprocessing

Content Based Filtering

Collaborative Based Filtering

Data Preparation

Normalization

Model Development dengan Content Based Filtering

Model Development dengan Collaborative Based Filtering

Evaluasi Model dengan RMSE

	userId	movieId	rating	year_month	title	genres	year
0	1	2	3.5	2005-04	Jumanji (1995)	Adventure\|Children\|Fantasy	1995
1	1	29	3.5	2005-04	City of Lost Children, The (Cité des enfants p...	Adventure\|Drama\|Fantasy\|Mystery\|Sci-Fi	1995
2	1	32	3.5	2005-04	Twelve Monkeys (a.k.a. 12 Monkeys) (1995)	Mystery\|Sci-Fi\|Thriller	1995
3	1	47	3.5	2005-04	Seven (a.k.a. Se7en) (1995)	Mystery\|Thriller	1995
4	1	50	3.5	2005-04	Usual Suspects, The (1995)	Crime\|Mystery\|Thriller	1995

	title	combined_clean
20307	Batman: The Dark Knight Returns, Part 2 (2013)	action animation le 300 rating action packed a...
18090	Batman: Year One (2011)	action animation crime batman le 300 rating ba...
21255	Batman Unmasked: The Psychology of the Dark Kn...	documentary batman film psychology batman
3126	Batman: Mask of the Phantasm (1993)	animation child adapted fromcomic alter ego ba...
12897	Batman: Gotham Knight (2008)	action animation crime batman anime batman goo...
21462	Batman: Mystery of the Batwoman (2003)	action animation child crime le 300 rating bas...
15566	Batman: Under the Red Hood (2010)	action animation animation antihero batman com...
20891	Justice League: Doom (2012)	action animation fantasy lauren montgomery bat...
8646	Batman (1966)	action adventure comedy adapted fromcomic alte...
21189	LEGO Batman: The Movie - DC Heroes Unite (2013)	action adventure animation le 300 rating anima...

	title	genres
14247	Kinsey (2004)	Drama
21704	Zorro, the Gay Blade (1981)	Comedy
21968	Toy Soldiers (1991)	Action\|Drama
22007	Megaforce (1982)	Action\|Sci-Fi
26818	Sure Thing, The (1985)	Comedy\|Romance
96607	Hour of the Wolf (Vargtimmen) (1968)	Drama\|Horror
107778	C.H.U.D. (1984)	Horror
181830	Spirits of the Dead (1968)	Horror\|Mystery
641679	Macabre (1958)	Horror\|Thriller