In [2]:
import pandas as pd # data management
import numpy as np # data management
import re # regular expressions
import nltk # text preprocessing
from nltk.tokenize import word_tokenize # tokenization
from nltk.corpus import stopwords # stopwords
from nltk.stem import WordNetLemmatizer # lemmatization
from spellchecker import SpellChecker # spellcheck
from sklearn.feature_extraction.text import CountVectorizer # text vectorization
from sklearn.model_selection import train_test_split # train test split
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.tree import DecisionTreeClassifier # decision tree
from sklearn.ensemble import RandomForestClassifier # random forest
from xgboost import XGBClassifier # xgboost
from sklearn.decomposition import PCA # principal component analysis
from sklearn.preprocessing import StandardScaler # z score normalization
from imblearn.over_sampling import SMOTE # synthetic minority oversampling technique
from collections import Counter # counting target class observations
from sklearn.preprocessing import LabelEncoder # convert categorical labels to numeric
from sklearn.svm import LinearSVC # linear svm
from sklearn.svm import SVC # non linear svm kernels
from sklearn import metrics # used for confusion matrix
import matplotlib.pyplot as plt # used to display confusion matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # evaluation metrics
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download("words")
[nltk_data] Downloading package punkt to C:\Users\Liam [nltk_data] Frank\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to C:\Users\Liam [nltk_data] Frank\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to C:\Users\Liam [nltk_data] Frank\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package words to C:\Users\Liam [nltk_data] Frank\AppData\Roaming\nltk_data... [nltk_data] Package words is already up-to-date!
Out[2]:
True
In [2]:
data = pd.read_csv('ecommerceDataset.csv') # load in dataset
In [3]:
na_counts = data.isna().sum().sum() # NaN obsrvation count
print('NaN observations:',na_counts) # print count
print(f"Number of samples: {len(data)}") # length before dropping observations with NaN value
X= data.dropna() # drop
print(f"Number of samples: {len(X)}") # length after dropping
NaN observations: 1 Number of samples: 50424 Number of samples: 50423
In [131]:
X = pd.DataFrame(X) # assigning dataframe type
X.rename(columns={X.columns[0]: 'Target'}, inplace=True) # naming target column
X.rename(columns={X.columns[1]: 'Text'}, inplace=True) # naming text feature column
print(X.head()) # print head of dataframe
Target Text 0 Household SAF 'Floral' Framed Painting (Wood, 30 inch x ... 1 Household SAF 'UV Textured Modern Art Print Framed' Pain... 2 Household SAF Flower Print Framed Painting (Synthetic, 1... 3 Household Incredible Gifts India Wooden Happy Birthday U... 4 Household Pitaara Box Romantic Venice Canvas Painting 6m...
In [132]:
def remove_punctuation(text): # function to remove punctuation from text
words = word_tokenize(text)
words_without_punct = [word for word in words if word.isalnum()]
clean_text = ' '.join(words_without_punct)
return clean_text
X['Cleaned_Text'] = X['Text'].apply(remove_punctuation)
print('action complete')
action complete
In [133]:
X['Cleaned_Text'] = X['Cleaned_Text'].str.lower() # converting all text to lowercase
print('action complete')
action complete
In [134]:
def remove_stop_words(text): # function to remove stopwords from text
stop_words = set(stopwords.words('english'))
words = word_tokenize(text)
filtered_words = [word for word in words if word.lower() not in stop_words]
clean_text_without_stopwords = ' '.join(filtered_words)
return clean_text_without_stopwords
X['Stop_Text'] = X['Cleaned_Text'].apply(remove_stop_words)
print('action complete')
action complete
In [ ]:
X.to_csv('StopText.csv', index=False) # writing cleaned stop text to csv file
print('action complete')
In [ ]:
def spell_check(text): # function to spellcheck text (note runtime, this step is skipped over)
spell = SpellChecker()
words = text.split()
corrected_text = ' '.join(list(spell.candidates(word))[0] if spell.candidates(word) else word for word in words)
return corrected_text
for index, text in enumerate(X['Stop_Text']):
X.at[index, 'Spell_Text'] = spell_check(text)
print(f"Processing observation number: {index + 1} out of {len(X)}")
print('action complete')
In [3]:
X = pd.read_csv('StopText.csv') # opening cleaned stop text csv
print(X.head())
Target Text \
0 Household SAF 'Floral' Framed Painting (Wood, 30 inch x ...
1 Household SAF 'UV Textured Modern Art Print Framed' Pain...
2 Household SAF Flower Print Framed Painting (Synthetic, 1...
3 Household Incredible Gifts India Wooden Happy Birthday U...
4 Household Pitaara Box Romantic Venice Canvas Painting 6m...
Cleaned_Text \
0 saf framed painting wood 30 inch x 10 inch spe...
1 saf textured modern art print framed painting ...
2 saf flower print framed painting synthetic inc...
3 incredible gifts india wooden happy birthday u...
4 pitaara box romantic venice canvas painting 6m...
Stop_Text
0 saf framed painting wood 30 inch x 10 inch spe...
1 saf textured modern art print framed painting ...
2 saf flower print framed painting synthetic inc...
3 incredible gifts india wooden happy birthday u...
4 pitaara box romantic venice canvas painting 6m...
In [4]:
print(f"Number of training samples: {len(X)}") # length of dataframe
X = X[X['Stop_Text'].apply(lambda x: type(x) == str)] # removing singular NaN observation
print(f"Number of training samples: {len(X)}") # length after removing NaN observation
Number of training samples: 50423 Number of training samples: 50422
In [5]:
def lemmatize_text(text): # function to lemmatize text
lemmatizer = WordNetLemmatizer()
words = word_tokenize(text)
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
lemmatized_text = ' '.join(lemmatized_words)
return lemmatized_text
X['Lemm_Text'] = X['Stop_Text'].apply(lemmatize_text)
print('action complete')
action complete
In [6]:
Xf = X[['Lemm_Text','Target']].copy() # creating seperate dataframe with lemmatized text and target variable
print(Xf) # printing head of new dataframe
Lemm_Text Target 0 saf framed painting wood 30 inch x 10 inch spe... Household 1 saf textured modern art print framed painting ... Household 2 saf flower print framed painting synthetic inc... Household 3 incredible gift india wooden happy birthday un... Household 4 pitaara box romantic venice canvas painting 6m... Household ... ... ... 50418 strontium microsd class 10 8gb memory card bla... Electronics 50419 crossbeats wave waterproof bluetooth wireless ... Electronics 50420 karbonn titanium wind w4 white karbonn titaniu... Electronics 50421 samsung guru fm plus black colour black compac... Electronics 50422 micromax canvas win w121 white Electronics [50422 rows x 2 columns]
In [7]:
# series of functions to remove non-english words and numerical values from text
valid_english_words = set(nltk.corpus.words.words())
def is_valid_english_word(token):
"""Check if the token is a valid English word."""
return token.lower() in valid_english_words
def is_english_word(token):
"""Check if the token consists only of English alphabetic characters."""
return bool(re.match("^[a-zA-Z]+$", token))
Xf['English_Words'] = Xf['Lemm_Text'].apply(
lambda text: ' '.join(
[word for word in word_tokenize(text)
if is_english_word(word) and is_valid_english_word(word)]
)
)
print('action complete')
action complete
In [10]:
vectorizer = CountVectorizer(ngram_range=(1,1)) # 1-gram text vectorization
X_bow = vectorizer.fit_transform(Xf['English_Words']) # fitting vectorizer
#bow_df = pd.DataFrame.sparse.from_spmatrix(X_bow, columns=vectorizer.get_feature_names_out()) # creating new dataframe for vectorized text
# Assuming X_bow is a sparse matrix (e.g., from CountVectorizer or TfidfVectorizer)
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
dimensions = bow_df.shape
print(dimensions) # printing new dataframe dimensions, 20818 unique words
(50422, 20818)
In [11]:
scaler = StandardScaler(with_mean=False) # z score normalization
data_scaled = scaler.fit_transform(bow_df)
pca = PCA(n_components=1000) # principal component analysis
pca_components = pca.fit_transform(data_scaled)
total_variance_retained = sum(pca.explained_variance_ratio_)
print("Total Retained Variance:", total_variance_retained * 100) # percentage of variance retained from original 20,000 dimensions
Total Retained Variance: 44.13521491768359
In [12]:
pca_df = pd.DataFrame(pca_components) # new features or principal components
print(pca_df.head()) # printing head of new features
0 1 2 3 4 5 6 \
0 -1.393164 -0.300634 -0.267478 -0.886341 -1.489242 -0.335877 0.071470
1 2.463593 -0.356065 0.607053 0.610923 0.482637 0.350160 -0.300589
2 1.870774 -0.214437 0.535417 0.349024 0.091995 0.334822 -0.211499
3 0.198192 0.824569 1.164338 0.347125 -0.078132 -0.097122 -0.299513
4 0.431436 -0.262142 -0.055239 0.945775 0.541915 -0.211355 -0.351137
7 8 9 ... 990 991 992 993 \
0 -0.072027 -0.530098 0.298228 ... -0.321108 -0.388467 0.082747 0.257667
1 0.489495 1.027533 -0.381123 ... -1.640848 -1.098713 2.024393 1.961074
2 0.459393 0.635908 -0.413404 ... -1.086066 -0.564310 1.402240 1.398825
3 -0.120107 0.208456 -0.398077 ... -1.577159 -0.576771 -0.179925 0.281563
4 0.210196 0.533725 0.026349 ... -0.858676 -0.627954 2.160582 -0.469683
994 995 996 997 998 999
0 0.138718 -0.195835 -0.008370 -0.447965 -0.292944 -0.237789
1 1.341667 -0.731678 -0.044773 -1.650160 -0.212247 -0.213103
2 0.864504 -0.942004 -0.167722 -1.328372 -0.373653 -0.021119
3 0.351640 1.898000 0.213533 0.114439 -0.704059 0.579606
4 0.817006 -1.671696 0.961320 -1.553002 -0.158825 0.483861
[5 rows x 1000 columns]
In [13]:
finaldf = pd.concat([pca_df, Xf['Target']], axis=1) # concatenated dataframe of principal components and target class
print("action complete")
dimensions = finaldf.shape
print(dimensions) # dimensions of final cleaned and vectorized nlp dataframe
action complete (50423, 1001)
In [14]:
finaldf = finaldf.drop(index=50422) # droping observation 50422
total_na_values = finaldf.isna().sum().sum()
print("Total number of NaN values:", total_na_values) # printing number of NaN values
finaldf = finaldf.dropna() # droping singular NaN value
total_na_values = finaldf.isna().sum().sum()
print("Total number of NaN values:", total_na_values) # printing number of NaN values post drop
Total number of NaN values: 1 Total number of NaN values: 0
In [15]:
class_counts = finaldf['Target'].value_counts() # displaying class imbalance
print(class_counts)
Target Household 19312 Books 11819 Electronics 10620 Clothing & Accessories 8670 Name: count, dtype: int64
In [16]:
X = finaldf.drop(columns=['Target']) # model input features
y = finaldf['Target'] # classification target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000) # train test split of roughly 80/20
print('action complete')
action complete
In [17]:
smote = SMOTE(random_state=42) # smote for balancing classes
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(f"Resampled class distribution: {Counter(y_resampled)}") # printing balance class distribution
X_train = pd.DataFrame(X_resampled, columns=X.columns) # new training dataframe of balanced classes
y_train = y_resampled
print('action complete')
Resampled class distribution: Counter({'Household': 15460, 'Books': 15460, 'Electronics': 15460, 'Clothing & Accessories': 15460})
action complete
In [52]:
nb_model = GaussianNB() # naive bayes
nb_model.fit(X_train.values, y_train)
y_pred = nb_model.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = np.unique(y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title("Naive Bayes")
plt.show()
Accuracy: 29.76, Precision: 0.41, Recall: 0.36, F1 Score: 0.28
In [54]:
clf = DecisionTreeClassifier() # decision tree
clf.fit(X_train.values, y_train)
y_pred = clf.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = np.unique(y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title("Decision Tree")
plt.show()
Accuracy: 93.33, Precision: 0.93, Recall: 0.93, F1 Score: 0.93
In [55]:
rf_model = RandomForestClassifier(n_estimators=200) # random forest
rf_model.fit(X_train.values, y_train)
y_pred = rf_model.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = np.unique(y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title("Random Forest")
plt.show()
Accuracy: 96.86, Precision: 0.97, Recall: 0.97, F1 Score: 0.97
In [56]:
label_encoder = LabelEncoder() # encoding target variable classes
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
xgb_model = XGBClassifier(eval_metric='mlogloss') # xgboost
xgb_model.fit(X_train.values, y_train_encoded)
xgb_pred = xgb_model.predict(X_test.values)
accuracy = accuracy_score(y_test_encoded, xgb_pred) * 100
precision = precision_score(y_test_encoded, xgb_pred, average='macro')
recall = recall_score(y_test_encoded, xgb_pred, average='macro')
f1 = f1_score(y_test_encoded, xgb_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))
confusion_matrix = metrics.confusion_matrix(y_test_encoded, xgb_pred)
labels = np.unique(y_test_encoded)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title("XGBoost")
plt.show()
Accuracy: 97.34, Precision: 0.98, Recall: 0.97, F1 Score: 0.97
In [69]:
booster = xgb_model.get_booster() # number of trees in XGBoost model
num_trees = booster.trees_to_dataframe()['Tree'].nunique()
print(num_trees)
400
In [67]:
importance_scores = xgb_model.feature_importances_ # finding principal components most influencing classification in XGboost model
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({
'Feature': feature_names,
'Importance': importance_scores
})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df.head(5))
Feature Importance 19 19 0.071901 34 34 0.048317 57 57 0.042197 132 132 0.021426 127 127 0.016457
In [75]:
feature_names = bow_df.columns # finding origianl text vectorized words with highest loading values in PC19
pc19_loadings = pca.components_[19]
pc19_importance_df = pd.DataFrame({
'Feature': feature_names,
'PC19 Loading': pc19_loadings,
'PC19 Importance': np.abs(pc19_loadings)
})
pc19_importance_df = pc19_importance_df.sort_values(by='PC19 Importance', ascending=False)
print(pc19_importance_df.head(10))
Feature PC19 Loading PC19 Importance 943 argumentation 0.087237 0.087237 11899 naysayer 0.087237 0.087237 9282 inexhaustible 0.087237 0.087237 20642 worrisome 0.087237 0.087237 19494 unimpeachable 0.087237 0.087237 10648 loquently 0.087237 0.087237 18967 trey 0.087237 0.087237 18071 syndicate 0.087237 0.087237 13584 pogge 0.087237 0.087237 19416 unequal 0.087237 0.087237
In [57]:
def train_and_evaluate_svm(kernel_name):
print(f'\nTraining and evaluating SVM with {kernel_name} kernel...')
svm_model = SVC(kernel=kernel_name)
svm_model.fit(X_train.values, y_train)
y_pred = svm_model.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = np.unique(y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title(f'\nSupport Vector Machine with {kernel_name} kernel...')
plt.show()
train_and_evaluate_svm('linear')
train_and_evaluate_svm('poly')
train_and_evaluate_svm('rbf')
train_and_evaluate_svm('sigmoid')
Training and evaluating SVM with linear kernel... Accuracy: 93.56, Precision: 0.93, Recall: 0.94, F1 Score: 0.94
Training and evaluating SVM with poly kernel... Accuracy: 29.92, Precision: 0.79, Recall: 0.37, F1 Score: 0.29
Training and evaluating SVM with rbf kernel... Accuracy: 89.93, Precision: 0.90, Recall: 0.90, F1 Score: 0.90
Training and evaluating SVM with sigmoid kernel... Accuracy: 87.47, Precision: 0.88, Recall: 0.88, F1 Score: 0.88