import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


# Load the diamonds dataset; the path is resolved relative to the working directory.
diamond_df = pd.read_csv(filepath_or_buffer='Diamonds_Prices.csv')
# Peek at the first five rows.
diamond_df.head(n=5)


# Summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
diamond_df.describe()


# Column names, non-null counts and dtypes.
diamond_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53943 entries, 0 to 53942
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53943 non-null  int64  
 1   carat       53943 non-null  float64
 2   cut         53943 non-null  object 
 3   color       53943 non-null  object 
 4   clarity     53943 non-null  object 
 5   depth       53943 non-null  float64
 6   table       53943 non-null  float64
 7   price       53943 non-null  int64  
 8   x           53943 non-null  float64
 9   y           53943 non-null  float64
 10  z           53943 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB


# 'Unnamed: 0' is a 1-based row counter (1..53943) and is not used as a feature.
diamond_df.drop(columns='Unnamed: 0', inplace=True)


# Number of diamonds per cut grade, most frequent grade first.
diamond_df['cut'].value_counts()

cut
Ideal        21551
Premium      13793
Very Good    12083
Good          4906
Fair          1610
Name: count, dtype: int64


# Count the rows that repeat an earlier row in every column.
# NOTE(review): the duplicates are only counted here; nothing in the visible
# code removes them before the model is trained.
diamond_df.duplicated().sum()

149


# Per-column count of missing entries.
diamond_df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64


# Bar chart of the number of diamonds in each cut category.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=diamond_df, x="cut", palette="dark", ax=ax)

ax.set_title("Data Distribution of Diamond Cuts")
plt.show()


# Bar chart of the number of diamonds in each color category.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=diamond_df, x="color", palette="dark", ax=ax)

ax.set_title("Data Distribution of Diamond Colors")
plt.show()


# Bar chart of the number of diamonds in each clarity category.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=diamond_df, x="clarity", palette="dark", ax=ax)

# The title names the plotted column (it previously repeated the color
# plot's title).
ax.set_title("Data Distribution of Diamond Clarity")
plt.show()


# One histogram per numeric column, drawn on a single large figure.
diamond_df.hist(edgecolor='white', figsize=(30, 30))
plt.show()


# Import relevant Machine Learning libraries
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer


# Replace every category label by its position in an ordered list of levels
# (position 0 first; the order presumably runs from worst to best grade).
# Series.map leaves NaN for any label that is not listed.
ordinal_levels = {
    'cut': ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
    'color': ['J', 'I', 'H', 'G', 'F', 'E', 'D'],
    'clarity': ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],
}
for column, levels in ordinal_levels.items():
    rank_of = {level: rank for rank, level in enumerate(levels)}
    diamond_df[column] = diamond_df[column].map(rank_of)


# Predictors (x): carat, the three encoded grades and the x/y/z columns;
# `depth` and `table` are not used.
feature_columns = ['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z']
x = diamond_df.loc[:, feature_columns]
# Target (y): price, kept as a one-column DataFrame.
y = diamond_df.loc[:, ['price']]


# Split the dataset into train-test set
# The split is done before scaling so that the held-out rows take no part in
# estimating the scaler's statistics. random_state pins the shuffle so the
# reported metrics can be reproduced from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=42
)


# Data Preprocessing
# Fit the scaler on the training features only, then apply the same fitted
# transformation to the test features. `x` itself is left unscaled.
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# Apply random forest prediction
# random_state makes the fitted forest reproducible between runs.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
# The regressor expects a 1-D target. y_train is a one-column frame, so it is
# flattened explicitly instead of relying on scikit-learn's implicit
# conversion, whose DataConversionWarning is hidden by the global
# warnings filter at the top of the file.
random_forest.fit(x_train, np.ravel(y_train))

RandomForestRegressor()

RandomForestRegressor()


# Predict the price of every row in the test split.
randfor_pred = random_forest.predict(X=x_test)


# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=y_test, y_pred=randfor_pred)

0.9813881897340959


# Mean absolute error of the predictions, in the same units as `price`.
mean_absolute_error(y_true=y_test, y_pred=randfor_pred)

272.578972474169


# Mean squared error of the predictions (squared `price` units).
mean_squared_error(y_true=y_test, y_pred=randfor_pred)

291984.9172608294

	Unnamed: 0	carat	cut	color	clarity	depth	table	price	x	y	z
0	1	0.23	Ideal	E	SI2	61.5	55.0	326	3.95	3.98	2.43
1	2	0.21	Premium	E	SI1	59.8	61.0	326	3.89	3.84	2.31
2	3	0.23	Good	E	VS1	56.9	65.0	327	4.05	4.07	2.31
3	4	0.29	Premium	I	VS2	62.4	58.0	334	4.20	4.23	2.63
4	5	0.31	Good	J	SI2	63.3	58.0	335	4.34	4.35	2.75

	Unnamed: 0	carat	depth	table	price	x	y	z
count	53943.000000	53943.000000	53943.000000	53943.000000	53943.000000	53943.000000	53943.000000	53943.000000
mean	26972.000000	0.797935	61.749322	57.457251	3932.734294	5.731158	5.734526	3.538730
std	15572.147122	0.473999	1.432626	2.234549	3989.338447	1.121730	1.142103	0.705679
min	1.000000	0.200000	43.000000	43.000000	326.000000	0.000000	0.000000	0.000000
25%	13486.500000	0.400000	61.000000	56.000000	950.000000	4.710000	4.720000	2.910000
50%	26972.000000	0.700000	61.800000	57.000000	2401.000000	5.700000	5.710000	3.530000
75%	40457.500000	1.040000	62.500000	59.000000	5324.000000	6.540000	6.540000	4.040000
max	53943.000000	5.010000	79.000000	95.000000	18823.000000	10.740000	58.900000	31.800000

Diamond Price Prediction using Random Forest