import pandas as pd # data management
import numpy as np # data management
import re # regular expressions
import nltk # text preprocessing
from nltk.tokenize import word_tokenize # tokenization
from nltk.corpus import stopwords # stopwords 
from nltk.stem import WordNetLemmatizer # lemmatization
from spellchecker import SpellChecker # spellcheck
from sklearn.feature_extraction.text import CountVectorizer # text vectorization
from sklearn.model_selection import train_test_split # train test split
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.tree import DecisionTreeClassifier # decision tree
from sklearn.ensemble import RandomForestClassifier # random forest
from xgboost import XGBClassifier # xgboost 
from sklearn.decomposition import PCA # principal component analysis
from sklearn.preprocessing import StandardScaler # z score normalization
from imblearn.over_sampling import SMOTE # synthetic minority oversampling technique
from collections import Counter # counting target class observations
from sklearn.preprocessing import LabelEncoder # convert categorical labels to numeric
from sklearn.svm import LinearSVC # linear svm
from sklearn.svm import SVC # non linear svm kernels
from sklearn import metrics # used for confusion matrix
import matplotlib.pyplot as plt # used to display confusion matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # evaluation metrics 

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download("words")

[nltk_data] Downloading package punkt to C:\Users\Liam
[nltk_data]     Frank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Liam
[nltk_data]     Frank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Liam
[nltk_data]     Frank\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Liam
[nltk_data]     Frank\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!

True

data = pd.read_csv('ecommerceDataset.csv') # load in dataset

na_counts = data.isna().sum().sum() # NaN obsrvation count
print('NaN observations:',na_counts) # print count
print(f"Number of samples: {len(data)}") # length before dropping observations with NaN value
X= data.dropna() # drop
print(f"Number of samples: {len(X)}") # length after dropping

NaN observations: 1
Number of samples: 50424
Number of samples: 50423

X = pd.DataFrame(X) # assigning dataframe type
X.rename(columns={X.columns[0]: 'Target'}, inplace=True) # naming target column
X.rename(columns={X.columns[1]: 'Text'}, inplace=True) # naming text feature column
print(X.head()) # print head of dataframe

      Target                                               Text
0  Household  SAF 'Floral' Framed Painting (Wood, 30 inch x ...
1  Household  SAF 'UV Textured Modern Art Print Framed' Pain...
2  Household  SAF Flower Print Framed Painting (Synthetic, 1...
3  Household  Incredible Gifts India Wooden Happy Birthday U...
4  Household  Pitaara Box Romantic Venice Canvas Painting 6m...

def remove_punctuation(text): # function to remove punctuation from text 
    words = word_tokenize(text)  
    words_without_punct = [word for word in words if word.isalnum()]  
    clean_text = ' '.join(words_without_punct)  
    return clean_text
X['Cleaned_Text'] = X['Text'].apply(remove_punctuation)
print('action complete')

action complete

X['Cleaned_Text'] = X['Cleaned_Text'].str.lower() # converting all text to lowercase
print('action complete')

action complete

def remove_stop_words(text): # function to remove stopwords from text
    stop_words = set(stopwords.words('english'))  
    words = word_tokenize(text)  
    filtered_words = [word for word in words if word.lower() not in stop_words]
    clean_text_without_stopwords = ' '.join(filtered_words)  
    return clean_text_without_stopwords

X['Stop_Text'] = X['Cleaned_Text'].apply(remove_stop_words)
print('action complete')

action complete

X.to_csv('StopText.csv', index=False) # writing cleaned stop text to csv file
print('action complete')

def spell_check(text): # function to spellcheck text (note runtime, this step is skipped over)
    spell = SpellChecker()  
    words = text.split()  
    corrected_text = ' '.join(list(spell.candidates(word))[0] if spell.candidates(word) else word for word in words)
    return corrected_text

for index, text in enumerate(X['Stop_Text']):
    X.at[index, 'Spell_Text'] = spell_check(text)  
    print(f"Processing observation number: {index + 1} out of {len(X)}")
print('action complete')

X = pd.read_csv('StopText.csv') # opening cleaned stop text csv
print(X.head())

      Target                                               Text  \
0  Household  SAF 'Floral' Framed Painting (Wood, 30 inch x ...   
1  Household  SAF 'UV Textured Modern Art Print Framed' Pain...   
2  Household  SAF Flower Print Framed Painting (Synthetic, 1...   
3  Household  Incredible Gifts India Wooden Happy Birthday U...   
4  Household  Pitaara Box Romantic Venice Canvas Painting 6m...   

                                        Cleaned_Text  \
0  saf framed painting wood 30 inch x 10 inch spe...   
1  saf textured modern art print framed painting ...   
2  saf flower print framed painting synthetic inc...   
3  incredible gifts india wooden happy birthday u...   
4  pitaara box romantic venice canvas painting 6m...   

                                           Stop_Text  
0  saf framed painting wood 30 inch x 10 inch spe...  
1  saf textured modern art print framed painting ...  
2  saf flower print framed painting synthetic inc...  
3  incredible gifts india wooden happy birthday u...  
4  pitaara box romantic venice canvas painting 6m...

print(f"Number of training samples: {len(X)}") # length of dataframe
X = X[X['Stop_Text'].apply(lambda x: type(x) == str)] # removing singular NaN observation
print(f"Number of training samples: {len(X)}") # length after removing NaN observation

Number of training samples: 50423
Number of training samples: 50422

def lemmatize_text(text): # function to lemmatize text 
    lemmatizer = WordNetLemmatizer()  
    words = word_tokenize(text)  
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]  
    lemmatized_text = ' '.join(lemmatized_words)  
    return lemmatized_text
    
X['Lemm_Text'] = X['Stop_Text'].apply(lemmatize_text)
print('action complete')

action complete

Xf = X[['Lemm_Text','Target']].copy() # creating seperate dataframe with lemmatized text and target variable
print(Xf) # printing head of new dataframe

                                               Lemm_Text       Target
0      saf framed painting wood 30 inch x 10 inch spe...    Household
1      saf textured modern art print framed painting ...    Household
2      saf flower print framed painting synthetic inc...    Household
3      incredible gift india wooden happy birthday un...    Household
4      pitaara box romantic venice canvas painting 6m...    Household
...                                                  ...          ...
50418  strontium microsd class 10 8gb memory card bla...  Electronics
50419  crossbeats wave waterproof bluetooth wireless ...  Electronics
50420  karbonn titanium wind w4 white karbonn titaniu...  Electronics
50421  samsung guru fm plus black colour black compac...  Electronics
50422                     micromax canvas win w121 white  Electronics

[50422 rows x 2 columns]

# series of functions to remove non-english words and numerical values from text
valid_english_words = set(nltk.corpus.words.words())

def is_valid_english_word(token):
    """Check if the token is a valid English word."""
    return token.lower() in valid_english_words

def is_english_word(token):
    """Check if the token consists only of English alphabetic characters."""
    return bool(re.match("^[a-zA-Z]+$", token))

Xf['English_Words'] = Xf['Lemm_Text'].apply(
    lambda text: ' '.join(
        [word for word in word_tokenize(text) 
         if is_english_word(word) and is_valid_english_word(word)]
    )
)

print('action complete')

action complete

vectorizer = CountVectorizer(ngram_range=(1,1)) # 1-gram text vectorization
X_bow = vectorizer.fit_transform(Xf['English_Words']) # fitting vectorizer 
#bow_df = pd.DataFrame.sparse.from_spmatrix(X_bow, columns=vectorizer.get_feature_names_out()) # creating new dataframe for vectorized text
# Assuming X_bow is a sparse matrix (e.g., from CountVectorizer or TfidfVectorizer)
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())
dimensions = bow_df.shape
print(dimensions) # printing new dataframe dimensions, 20818 unique words

(50422, 20818)

scaler = StandardScaler(with_mean=False) # z score normalization
data_scaled = scaler.fit_transform(bow_df)
pca = PCA(n_components=1000) # principal component analysis
pca_components = pca.fit_transform(data_scaled)
total_variance_retained = sum(pca.explained_variance_ratio_)
print("Total Retained Variance:", total_variance_retained * 100) # percentage of variance retained from original 20,000 dimensions

Total Retained Variance: 44.13521491768359

pca_df = pd.DataFrame(pca_components) # new features or principal components
print(pca_df.head()) # printing head of new features

        0         1         2         3         4         5         6    \
0 -1.393164 -0.300634 -0.267478 -0.886341 -1.489242 -0.335877  0.071470   
1  2.463593 -0.356065  0.607053  0.610923  0.482637  0.350160 -0.300589   
2  1.870774 -0.214437  0.535417  0.349024  0.091995  0.334822 -0.211499   
3  0.198192  0.824569  1.164338  0.347125 -0.078132 -0.097122 -0.299513   
4  0.431436 -0.262142 -0.055239  0.945775  0.541915 -0.211355 -0.351137   

        7         8         9    ...       990       991       992       993  \
0 -0.072027 -0.530098  0.298228  ... -0.321108 -0.388467  0.082747  0.257667   
1  0.489495  1.027533 -0.381123  ... -1.640848 -1.098713  2.024393  1.961074   
2  0.459393  0.635908 -0.413404  ... -1.086066 -0.564310  1.402240  1.398825   
3 -0.120107  0.208456 -0.398077  ... -1.577159 -0.576771 -0.179925  0.281563   
4  0.210196  0.533725  0.026349  ... -0.858676 -0.627954  2.160582 -0.469683   

        994       995       996       997       998       999  
0  0.138718 -0.195835 -0.008370 -0.447965 -0.292944 -0.237789  
1  1.341667 -0.731678 -0.044773 -1.650160 -0.212247 -0.213103  
2  0.864504 -0.942004 -0.167722 -1.328372 -0.373653 -0.021119  
3  0.351640  1.898000  0.213533  0.114439 -0.704059  0.579606  
4  0.817006 -1.671696  0.961320 -1.553002 -0.158825  0.483861  

[5 rows x 1000 columns]

finaldf = pd.concat([pca_df, Xf['Target']], axis=1) # concatenated dataframe of principal components and target class
print("action complete")
dimensions = finaldf.shape
print(dimensions) # dimensions of final cleaned and vectorized nlp dataframe

action complete
(50423, 1001)

finaldf = finaldf.drop(index=50422) # droping observation 50422
total_na_values = finaldf.isna().sum().sum()
print("Total number of NaN values:", total_na_values) # printing number of NaN values
finaldf = finaldf.dropna() # droping singular NaN value
total_na_values = finaldf.isna().sum().sum()
print("Total number of NaN values:", total_na_values) # printing number of NaN values post drop

Total number of NaN values: 1
Total number of NaN values: 0

class_counts = finaldf['Target'].value_counts() # displaying class imbalance
print(class_counts)

Target
Household                 19312
Books                     11819
Electronics               10620
Clothing & Accessories     8670
Name: count, dtype: int64

X = finaldf.drop(columns=['Target']) # model input features
y = finaldf['Target'] # classification target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10000) # train test split of roughly 80/20
print('action complete')

action complete

smote = SMOTE(random_state=42) # smote for balancing classes 
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f"Resampled class distribution: {Counter(y_resampled)}") # printing balance class distribution

X_train = pd.DataFrame(X_resampled, columns=X.columns) # new training dataframe of balanced classes
y_train = y_resampled
print('action complete')

Resampled class distribution: Counter({'Household': 15460, 'Books': 15460, 'Electronics': 15460, 'Clothing & Accessories': 15460})
action complete

nb_model = GaussianNB() # naive bayes
nb_model.fit(X_train.values, y_train) 
y_pred = nb_model.predict(X_test.values) 

accuracy = accuracy_score(y_test, y_pred) * 100 
precision = precision_score(y_test, y_pred, average='macro') 
recall = recall_score(y_test, y_pred, average='macro') 
f1 = f1_score(y_test, y_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))

confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = np.unique(y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title("Naive Bayes")
plt.show()

Accuracy: 29.76, Precision: 0.41, Recall: 0.36, F1 Score: 0.28

clf = DecisionTreeClassifier() # decision tree
clf.fit(X_train.values, y_train)
y_pred = clf.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred) * 100 
precision = precision_score(y_test, y_pred, average='macro') 
recall = recall_score(y_test, y_pred, average='macro') 
f1 = f1_score(y_test, y_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))

confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = np.unique(y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title("Decision Tree")
plt.show()

Accuracy: 93.33, Precision: 0.93, Recall: 0.93, F1 Score: 0.93

rf_model = RandomForestClassifier(n_estimators=200) # random forest
rf_model.fit(X_train.values, y_train) 
y_pred = rf_model.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred) * 100 
precision = precision_score(y_test, y_pred, average='macro') 
recall = recall_score(y_test, y_pred, average='macro') 
f1 = f1_score(y_test, y_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))

confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = np.unique(y_test)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title("Random Forest")
plt.show()

Accuracy: 96.86, Precision: 0.97, Recall: 0.97, F1 Score: 0.97

label_encoder = LabelEncoder() # encoding target variable classes 
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

xgb_model = XGBClassifier(eval_metric='mlogloss') # xgboost
xgb_model.fit(X_train.values, y_train_encoded)
xgb_pred = xgb_model.predict(X_test.values)

accuracy = accuracy_score(y_test_encoded, xgb_pred) * 100
precision = precision_score(y_test_encoded, xgb_pred, average='macro') 
recall = recall_score(y_test_encoded, xgb_pred, average='macro') 
f1 = f1_score(y_test_encoded, xgb_pred, average='macro')
print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))

confusion_matrix = metrics.confusion_matrix(y_test_encoded, xgb_pred)
labels = np.unique(y_test_encoded)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
cm_display.plot(cmap=plt.cm.Blues)
plt.xticks(rotation=45)
plt.title("XGBoost")
plt.show()

Accuracy: 97.34, Precision: 0.98, Recall: 0.97, F1 Score: 0.97

booster = xgb_model.get_booster() # number of trees in XGBoost model
num_trees = booster.trees_to_dataframe()['Tree'].nunique()
print(num_trees)

400

importance_scores = xgb_model.feature_importances_ # finding principal components most influencing classification in XGboost model
feature_names = X_train.columns

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance_scores
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df.head(5))

    Feature  Importance
19       19    0.071901
34       34    0.048317
57       57    0.042197
132     132    0.021426
127     127    0.016457

feature_names = bow_df.columns # finding origianl text vectorized words with highest loading values in PC19
pc19_loadings = pca.components_[19]
pc19_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'PC19 Loading': pc19_loadings,
    'PC19 Importance': np.abs(pc19_loadings)
})

pc19_importance_df = pc19_importance_df.sort_values(by='PC19 Importance', ascending=False)
print(pc19_importance_df.head(10))

             Feature  PC19 Loading  PC19 Importance
943    argumentation      0.087237         0.087237
11899       naysayer      0.087237         0.087237
9282   inexhaustible      0.087237         0.087237
20642      worrisome      0.087237         0.087237
19494  unimpeachable      0.087237         0.087237
10648      loquently      0.087237         0.087237
18967           trey      0.087237         0.087237
18071      syndicate      0.087237         0.087237
13584          pogge      0.087237         0.087237
19416        unequal      0.087237         0.087237

def train_and_evaluate_svm(kernel_name): 
    print(f'\nTraining and evaluating SVM with {kernel_name} kernel...')
    svm_model = SVC(kernel=kernel_name) 
    svm_model.fit(X_train.values, y_train) 

    y_pred = svm_model.predict(X_test.values)

    accuracy = accuracy_score(y_test, y_pred) * 100 
    precision = precision_score(y_test, y_pred, average='macro') 
    recall = recall_score(y_test, y_pred, average='macro') 
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1 Score: {:.2f}".format(accuracy, precision, recall, f1))

    confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
    labels = np.unique(y_test)
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = labels)
    cm_display.plot(cmap=plt.cm.Blues)
    plt.xticks(rotation=45)
    plt.title(f'\nSupport Vector Machine with {kernel_name} kernel...')
    plt.show() 
    
    
train_and_evaluate_svm('linear')
train_and_evaluate_svm('poly')
train_and_evaluate_svm('rbf')
train_and_evaluate_svm('sigmoid')

Training and evaluating SVM with linear kernel...
Accuracy: 93.56, Precision: 0.93, Recall: 0.94, F1 Score: 0.94

Training and evaluating SVM with poly kernel...
Accuracy: 29.92, Precision: 0.79, Recall: 0.37, F1 Score: 0.29

Training and evaluating SVM with rbf kernel...
Accuracy: 89.93, Precision: 0.90, Recall: 0.90, F1 Score: 0.90

Training and evaluating SVM with sigmoid kernel...
Accuracy: 87.47, Precision: 0.88, Recall: 0.88, F1 Score: 0.88