MMIS692 Quality Control¶

Items are produced in batches. A batch with more than 1 defective item is deemed a poor quality batch.

We shall develop a model to predict batch quality based on measurements from the 30 quality control tests. Our goal is to predict quality reliably based on as few rules as possible. Any test that is not used for determining batch quality may be eliminated, thereby reducing quality control costs.

For this classification task we shall use Scikit-Learn DecisionTreeClassifier:

  • https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

Mount Drive¶

In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Import libraries¶

In [ ]:
! pip install pandasql # to use SQL
import pandasql # for SQL queries
import pandas as pd # for data analysis
from sklearn.tree import DecisionTreeClassifier, plot_tree # for decision tree
from sklearn.model_selection import cross_val_score # for cross validation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report # to evaluate model
import matplotlib.pyplot as plt # for plots
Requirement already satisfied: pandasql in /usr/local/lib/python3.12/dist-packages (0.7.3)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from pandasql) (2.0.2)
Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from pandasql) (2.2.2)
Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.12/dist-packages (from pandasql) (2.0.44)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->pandasql) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->pandasql) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas->pandasql) (2025.2)
Requirement already satisfied: greenlet>=1 in /usr/local/lib/python3.12/dist-packages (from sqlalchemy->pandasql) (3.2.4)
Requirement already satisfied: typing-extensions>=4.6.0 in /usr/local/lib/python3.12/dist-packages (from sqlalchemy->pandasql) (4.15.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->pandasql) (1.17.0)

Get data¶

We shall extract the CSV files from the data zip file and read the data we need for this task into pandas dataframes:

  • defective: with data from 'quality_control.defective.csv'
  • measurements: with data from 'quality_control.measurements.csv'
  • new_batches: with data from 'quality_control.new_batches.csv'
In [ ]:
! unzip '/content/drive/MyDrive/NSU - MS - Data Analytics and AI/MMIS 0692 - Data Analytics and AI Project/Task 2/data.MMIS692.Fall2025.zip'
defective = pd.read_csv('quality_control.defective.csv')
measurements = pd.read_csv('quality_control.measurements.csv')
new_batches = pd.read_csv('quality_control.new_batches.csv')
! rm *.csv
Archive:  /content/drive/MyDrive/NSU - MS - Data Analytics and AI/MMIS 0692 - Data Analytics and AI Project/Task 2/data.MMIS692.Fall2025.zip
  inflating: quality_control.new_batches.csv  
  inflating: quality_control.measurements.csv  
  inflating: quality_control.defective.csv  
  inflating: production_planning.resource.csv  
  inflating: production_planning.product.csv  
  inflating: customer_segmentation.unlabeled.csv  
  inflating: customer_segmentation.valid.csv  
  inflating: customer_segmentation.train.csv  

Dataframe defective contains the batchID of batches in which defective items were produced.

In [ ]:
print("Dataframe defective contains %d rows and %d columns" %defective.shape)
defective.head(10) # display first 10 rows of defective
Dataframe defective contains 11130 rows and 2 columns
Out[ ]:
serial_nbr batchID
0 S007B022832 B022832
1 S001B048935 B048935
2 S002B021675 B021675
3 S002B020304 B020304
4 S000B009453 B009453
5 S002B001585 B001585
6 S002B003338 B003338
7 S007B023528 B023528
8 S002B038837 B038837
9 S005B014832 B014832

Dataframe measurements contains measurements on 30 quality control tests for each batch.

In [ ]:
print("Dataframe measurements contains %d rows and %d columns" %measurements.shape)
measurements.head() # display first 5 rows of measurements
Dataframe measurements contains 5000 rows and 31 columns
Out[ ]:
batchID test1 test2 test3 test4 test5 test6 test7 test8 test9 ... test21 test22 test23 test24 test25 test26 test27 test28 test29 test30
0 B049738 396 460 816 125 79 513 576 259 369 ... 19 918 692 958 45 381 758 466 478 777
1 B030478 792 474 779 109 843 985 946 644 839 ... 948 612 207 360 663 570 398 49 335 919
2 B023741 715 929 462 718 977 271 479 785 231 ... 168 995 221 456 489 120 530 165 397 203
3 B034393 92 39 359 625 354 292 672 749 119 ... 277 224 984 233 541 690 480 444 2 387
4 B041622 93 804 641 580 771 231 163 922 800 ... 483 319 322 249 421 439 993 807 634 205

5 rows × 31 columns

Label batches¶

We consider a batch to be of poor quality if it contains more than one defective item; otherwise, the batch quality is considered good.

The dataframe defective contains batchID for batches in which defective items were produced.

We shall identify batches with more than one defective item using both SQL and pandas, and confirm that the sets of poor quality batches found by the two methods are identical.

We shall then label each batch in the dataframe measurements as either a poor quality batch or a good quality batch.

Using SQL¶

SQL queries may be executed using pandasql; each dataframe is treated as a table.

In [ ]:
# define a string q with the SQL query to identify poor quality batches
q = """
select batchID
from defective
group by batchID
having count(*) > 1;"""

poor_batches_SQL = pandasql.sqldf(q).values # evaluate query to get poor quality batches

print("Number of poor quality batches = %d" %len(poor_batches_SQL))

# assign labels to each batch in the dataframe measurements
labels = ['poor' if b in poor_batches_SQL else 'good'
          for b in measurements.batchID] # assign labels for batches

# create dataframe with quality of batches labeled 'poor' or 'good'
quality_SQL = pd.DataFrame(zip(measurements.batchID, labels), columns=['batchID', 'quality'])
quality_SQL.head(10).to_csv('quality_SQL.csv', index=False) # save results for first 10 batches
print("Quality of first 10 batches")
quality_SQL.head(10)
Number of poor quality batches = 2224
Quality of first 10 batches
Out[ ]:
batchID quality
0 B049738 poor
1 B030478 good
2 B023741 good
3 B034393 good
4 B041622 good
5 B016158 poor
6 B005914 good
7 B000418 poor
8 B021404 poor
9 B024090 poor
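Note that the list comprehension above tests membership against a NumPy array, which scans the array once per batch. A plain Python set gives constant-time lookups instead; a minimal sketch with toy, hypothetical batch IDs (not the notebook's data):

```python
import pandas as pd

# Toy stand-ins for the notebook's dataframes (hypothetical values)
defective = pd.DataFrame({'batchID': ['B1', 'B1', 'B2', 'B3', 'B3', 'B3']})
measurements = pd.DataFrame({'batchID': ['B1', 'B2', 'B3', 'B4']})

# Batches with more than one defective item, held in a set for O(1) membership tests
counts = defective['batchID'].value_counts()
poor = set(counts[counts > 1].index)

labels = ['poor' if b in poor else 'good' for b in measurements['batchID']]
print(labels)  # ['poor', 'good', 'poor', 'good']
```

With 5,000 batches either approach is fast enough; the set pays off as the number of batches grows.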

Using pandas¶

In [ ]:
poor_batches = defective.groupby('batchID').filter(lambda x: len(x)>1).batchID.unique() # batches with more than 1 defective item

print("Number of poor quality batches = %d" %len(poor_batches))

labels = ['poor' if b in poor_batches else 'good' for b in measurements.batchID] # labels for batches

# create dataframe with quality of batches labeled 'poor' or 'good'
quality = pd.DataFrame(zip(measurements.batchID, labels), columns=['batchID', 'quality'])
quality.head(10).to_csv('quality.csv', index=False)
print("Quality of first 10 batches")
quality.head(10)
Number of poor quality batches = 2224
Quality of first 10 batches
Out[ ]:
batchID quality
0 B049738 poor
1 B030478 good
2 B023741 good
3 B034393 good
4 B041622 good
5 B016158 poor
6 B005914 good
7 B000418 poor
8 B021404 poor
9 B024090 poor

Confirm that quality results are identical¶

In [ ]:
quality.equals(quality_SQL) # True only if both dataframes contain identical values
Out[ ]:
True
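A subtle pandas pitfall is worth flagging here: `all(df1 == df2)` iterates over the *column labels* of the comparison result, not its values, so it returns True even when the frames differ; `DataFrame.equals` (or `.all().all()`) compares values. A toy demonstration:

```python
import pandas as pd

a = pd.DataFrame({'batchID': ['B1', 'B2'], 'quality': ['good', 'poor']})
b = pd.DataFrame({'batchID': ['B1', 'B2'], 'quality': ['good', 'good']})  # differs in row 1

# Iterating a DataFrame yields its column labels (truthy strings), so this misleads
print(all(a == b))           # True, despite the differing value

# Value-wise checks
print(a.equals(b))           # False
print((a == b).all().all())  # False
```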

Reserve data for validation¶

We shall use the first 4000 samples for training and reserve the last 1000 samples for validation.

X_train, y_train: contain input feature values and output labels for training data.

X_test, y_test: contain input feature values and output labels for validation data.

In [ ]:
# all but the first column in dataframe measurements are input features
features = list(measurements)[1:] # column headers for input features
print("Number of input features = %d" %len(features))
print(', '.join(features)) # display features

k = 4000 # use first k samples for training
print("First %d samples used for training" %k)
X_train, y_train = measurements[features][:k], quality.quality[:k] # input, label for training
X_test, y_test = measurements[features][k:], quality.quality[k:] # input, label for validation
Number of input features = 30
test1, test2, test3, test4, test5, test6, test7, test8, test9, test10, test11, test12, test13, test14, test15, test16, test17, test18, test19, test20, test21, test22, test23, test24, test25, test26, test27, test28, test29, test30
First 4000 samples used for training
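The slice-based split above can also be written with Scikit-Learn's `train_test_split`; with `shuffle=False` and an integer `train_size` it reproduces the same first-k / last-rest split. A sketch on toy data (not the notebook's dataframes):

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy stand-ins: 10 samples, 2 hypothetical test columns
X = pd.DataFrame({'test1': range(10), 'test2': range(10, 20)})
y = pd.Series(['good'] * 5 + ['poor'] * 5)

# With shuffle=False this matches X[:8] / X[8:]
X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=8, shuffle=False)
print(len(X_tr), len(X_te))  # 8 2
```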

Cross validation accuracy¶

We shall use Scikit-Learn DecisionTreeClassifier to predict batch quality based on measurements from the 30 tests.

Since our goal is to achieve satisfactory classification using as few rules as possible, we shall investigate how classification accuracy varies as we increase the maximum number of leaf nodes (rules) in the decision tree.

We shall use 5-fold cross-validation with the training samples for this investigation.

We should watch for overfitting: beyond a certain number of rules, cross-validation accuracy stops improving and begins to decline.

In [ ]:
for k in range(2, 20): # maximum number of leaf-nodes (rules)
    model = DecisionTreeClassifier(max_leaf_nodes=k) # tree with k rules
    acc = cross_val_score(model, X_train, y_train, cv=5).mean() # mean accuracy in 5-fold cross-validation
    print("Cross validation accuracy with %d rules = %4.4f" %(k, acc))
Cross validation accuracy with 2 rules = 0.6165
Cross validation accuracy with 3 rules = 0.7688
Cross validation accuracy with 4 rules = 0.8820
Cross validation accuracy with 5 rules = 0.9615
Cross validation accuracy with 6 rules = 0.9613
Cross validation accuracy with 7 rules = 0.9605
Cross validation accuracy with 8 rules = 0.9603
Cross validation accuracy with 9 rules = 0.9585
Cross validation accuracy with 10 rules = 0.9580
Cross validation accuracy with 11 rules = 0.9587
Cross validation accuracy with 12 rules = 0.9577
Cross validation accuracy with 13 rules = 0.9583
Cross validation accuracy with 14 rules = 0.9580
Cross validation accuracy with 15 rules = 0.9565
Cross validation accuracy with 16 rules = 0.9570
Cross validation accuracy with 17 rules = 0.9557
Cross validation accuracy with 18 rules = 0.9555
Cross validation accuracy with 19 rules = 0.9553
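The choice of rule count can also be automated: take the smallest number of rules whose mean cross-validation accuracy is within a small tolerance of the best. A sketch using a subset of the accuracies printed above (the tolerance value is an arbitrary choice):

```python
# Mean 5-fold CV accuracies printed above (first entries), keyed by rule count
acc = {2: 0.6165, 3: 0.7688, 4: 0.8820, 5: 0.9615,
       6: 0.9613, 7: 0.9605, 8: 0.9603, 9: 0.9585, 10: 0.9580}

tol = 0.005  # tolerance below the best accuracy
best = max(acc.values())
k = min(n for n, a in acc.items() if a >= best - tol)  # fewest acceptable rules
print("Chosen number of rules =", k)  # Chosen number of rules = 5
```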

Choose a good model¶

Choose the smallest number of rules that results in satisfactory cross-validation accuracy.

In [ ]:
k = 5 # chosen number of leaf nodes
model = DecisionTreeClassifier(max_leaf_nodes=k) # create model
model.fit(X_train, y_train) # train model

plt.figure(figsize=(10, 10)) # size of figure to be displayed

CL = model.classes_ # class labels (good / poor)
plot_tree(model, feature_names=features, class_names=CL,
          filled=True, rounded=True, impurity=False) # plot tree

plt.savefig("decision_tree.png") # save as PNG file
plt.show() # show tree
[decision_tree.png: the fitted tree with 5 leaf nodes, each leaf labeled good or poor]
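The fitted rules can also be inspected as plain text with `sklearn.tree.export_text`, which is convenient for counting which tests the tree actually uses. A self-contained sketch on synthetic data (not the quality control measurements):

```python
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, export_text

# Synthetic stand-in for the measurements data (5 hypothetical tests)
X, y = make_classification(n_samples=200, n_features=5, random_state=0)
model = DecisionTreeClassifier(max_leaf_nodes=3, random_state=0).fit(X, y)

# Indented if/else view of the same rules plot_tree draws
rules = export_text(model, feature_names=[f'test{i+1}' for i in range(5)])
print(rules)
```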

Results with training data¶

In [ ]:
pred_train = model.predict(X_train) # predicted labels for training samples
acc_train = accuracy_score(y_train, pred_train) # training accuracy
print("Training accuracy  = %4.4f" %(acc_train))
# confusion matrix
cm = pd.DataFrame(confusion_matrix(y_train, pred_train), columns=[f'Predicted {c}' for c in CL], index=CL)
cm.to_csv('training_confusion_matrix.csv') # save confusion matrix
cm
Training accuracy  = 0.9637
Out[ ]:
Predicted good Predicted poor
good 2157 68
poor 77 1698
In [ ]:
print(classification_report(y_train, pred_train, digits=4))
              precision    recall  f1-score   support

        good     0.9655    0.9694    0.9675      2225
        poor     0.9615    0.9566    0.9591      1775

    accuracy                         0.9637      4000
   macro avg     0.9635    0.9630    0.9633      4000
weighted avg     0.9637    0.9637    0.9637      4000

Results with validation data¶

In [ ]:
pred_test = model.predict(X_test) # predicted labels for validation samples
acc_test = accuracy_score(y_test, pred_test) # validation accuracy
print("Validation accuracy  = %4.4f" %(acc_test))
cm = pd.DataFrame(confusion_matrix(y_test, pred_test), columns=[f'Predicted {c}' for c in CL], index=CL)
cm.to_csv('validation_confusion_matrix.csv')
cm
Validation accuracy  = 0.9700
Out[ ]:
Predicted good Predicted poor
good 535 16
poor 14 435
In [ ]:
print(classification_report(y_test, pred_test, digits=4))
              precision    recall  f1-score   support

        good     0.9745    0.9710    0.9727       551
        poor     0.9645    0.9688    0.9667       449

    accuracy                         0.9700      1000
   macro avg     0.9695    0.9699    0.9697      1000
weighted avg     0.9700    0.9700    0.9700      1000
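The report's figures follow directly from the validation confusion matrix above; for example, precision for 'poor' is the share of predicted-poor batches that are actually poor. A quick check by hand:

```python
# Validation confusion matrix from above (rows = actual, columns = predicted)
good_good, good_poor = 535, 16   # actual good, predicted good / poor
poor_good, poor_poor = 14, 435   # actual poor, predicted good / poor

precision_poor = poor_poor / (poor_poor + good_poor)  # predicted poor that are truly poor
recall_poor = poor_poor / (poor_poor + poor_good)     # actual poor that were caught
accuracy = (good_good + poor_poor) / 1000

print(round(precision_poor, 4), round(recall_poor, 4), accuracy)  # 0.9645 0.9688 0.97
```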


Predict quality for unlabeled samples¶

In [ ]:
new_batches
Out[ ]:
batchID test1 test2 test3 test4 test5 test6 test7 test8 test9 ... test21 test22 test23 test24 test25 test26 test27 test28 test29 test30
0 B050257 933 898 591 719 639 522 321 420 245 ... 246 365 13 53 670 691 594 661 101 284
1 B017350 346 888 469 821 443 351 839 3 990 ... 597 427 131 949 694 118 350 683 369 415
2 B013761 479 860 229 357 560 708 153 654 552 ... 871 609 147 804 21 68 316 627 877 461
3 B038024 308 711 786 327 672 767 715 155 260 ... 545 429 418 264 798 860 732 764 506 390
4 B015511 302 383 590 211 746 224 833 481 286 ... 167 117 211 348 225 611 191 462 964 314
5 B049530 81 47 756 751 626 964 585 506 104 ... 153 388 590 118 200 262 590 338 184 443
6 B039271 95 950 247 938 373 552 306 268 643 ... 784 513 56 5 66 797 518 364 492 309
7 B047406 134 487 555 238 899 822 285 754 672 ... 157 134 21 686 99 532 104 344 35 487
8 B042002 134 878 598 699 34 46 135 684 885 ... 784 257 736 711 870 630 418 233 20 202
9 B034981 975 723 52 256 986 63 307 645 116 ... 394 467 103 464 59 584 117 611 769 545
10 B017251 883 543 944 392 39 531 119 416 310 ... 179 85 591 553 728 380 542 723 419 950
11 B008720 401 967 526 74 556 287 956 277 19 ... 581 950 371 632 423 843 998 0 411 284
12 B018975 206 887 290 235 647 40 240 16 834 ... 589 400 918 805 609 926 629 79 444 561
13 B044633 953 821 562 135 118 429 982 722 62 ... 151 608 546 485 766 935 360 469 888 156
14 B031398 330 904 490 459 74 95 556 988 389 ... 588 707 153 479 571 784 332 724 993 375
15 B036850 628 998 104 193 989 135 478 2 969 ... 38 910 36 80 929 612 940 10 36 227
16 B045777 564 518 795 769 208 128 735 240 762 ... 871 476 400 441 881 158 753 234 613 304
17 B007795 528 252 254 663 389 420 210 549 418 ... 157 446 965 128 83 142 4 163 780 633
18 B014767 3 571 471 655 787 427 682 600 979 ... 966 791 469 884 507 541 465 843 310 586
19 B048223 679 578 821 17 165 766 439 415 397 ... 955 455 629 788 751 468 979 241 318 153
20 B035035 380 163 993 394 868 106 147 612 847 ... 886 34 37 971 747 922 613 468 203 506
21 B014448 840 187 18 774 435 289 822 231 234 ... 899 187 226 206 924 124 523 535 333 184
22 B012586 58 203 558 812 841 652 242 21 805 ... 232 499 209 662 922 86 295 508 187 97
23 B037453 279 962 704 560 633 948 390 902 44 ... 728 155 344 287 993 926 51 306 537 273
24 B041726 897 912 572 21 671 739 255 859 547 ... 376 461 129 720 138 365 834 911 471 913
25 B003675 76 731 77 597 189 137 856 121 113 ... 327 55 539 597 19 292 698 560 365 602
26 B018601 938 569 488 871 368 20 608 783 868 ... 686 830 583 84 87 334 371 741 773 421
27 B034924 455 421 944 127 461 842 125 554 521 ... 37 740 480 615 111 926 444 306 242 722
28 B038527 171 238 586 189 381 333 6 957 865 ... 590 510 560 876 983 589 811 25 413 526
29 B045822 333 391 426 274 632 575 218 39 283 ... 331 326 290 710 288 318 552 830 911 44
30 B005577 373 121 97 371 940 71 736 768 317 ... 496 160 187 613 778 112 418 175 571 871
31 B027718 107 759 601 78 196 23 860 480 738 ... 818 130 143 216 666 395 857 799 793 828
32 B004606 378 418 674 12 163 508 289 224 713 ... 476 253 587 232 804 664 219 49 804 647
33 B038664 995 804 512 815 774 980 659 145 910 ... 415 116 19 598 806 105 225 612 547 399
34 B043264 195 661 468 167 380 746 655 904 65 ... 422 445 19 223 78 960 266 33 895 720
35 B027719 40 545 785 445 982 446 298 610 939 ... 592 746 339 300 604 859 135 866 911 720
36 B042165 197 424 39 506 516 236 417 270 20 ... 522 76 786 143 871 426 435 997 315 486
37 B019164 853 547 100 940 208 807 893 750 122 ... 558 389 382 262 261 413 42 836 662 933
38 B048145 366 815 838 166 334 648 859 310 489 ... 176 915 363 81 936 793 937 124 388 994
39 B036549 65 584 685 737 420 490 149 561 61 ... 429 239 634 595 414 785 41 753 7 804

40 rows × 31 columns

In [ ]:
new_labels = pd.DataFrame()
new_labels['batchID'] = new_batches.batchID
new_labels['quality'] = model.predict(new_batches[features])
new_labels.to_csv("new_labels.csv", index=False)
new_labels
Out[ ]:
batchID quality
0 B050257 good
1 B017350 good
2 B013761 good
3 B038024 poor
4 B015511 good
5 B049530 good
6 B039271 good
7 B047406 poor
8 B042002 good
9 B034981 good
10 B017251 good
11 B008720 good
12 B018975 good
13 B044633 good
14 B031398 good
15 B036850 good
16 B045777 good
17 B007795 good
18 B014767 good
19 B048223 good
20 B035035 good
21 B014448 good
22 B012586 poor
23 B037453 good
24 B041726 good
25 B003675 good
26 B018601 poor
27 B034924 poor
28 B038527 poor
29 B045822 poor
30 B005577 good
31 B027718 poor
32 B004606 poor
33 B038664 poor
34 B043264 poor
35 B027719 poor
36 B042165 poor
37 B019164 poor
38 B048145 poor
39 B036549 poor

Feature importances¶

In [ ]:
plt.figure(figsize=(16, 4)) # size of figure to be displayed
plt.bar(features, model.feature_importances_)
plt.xticks(rotation = 90, fontsize=16)
plt.title('Feature importances', fontsize=24)
plt.show()
[Bar chart of feature importances for the 30 tests]
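Any test with zero importance in the fitted tree is never consulted and is therefore a candidate for elimination, per the goal stated at the top. A self-contained sketch on synthetic data where, by construction, only one of five hypothetical tests matters:

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
X = rng.integers(0, 1000, size=(500, 5)).astype(float)  # 5 hypothetical tests
y = np.where(X[:, 2] > 500, 'poor', 'good')             # only test3 determines quality

features = [f'test{i+1}' for i in range(5)]
model = DecisionTreeClassifier(max_leaf_nodes=5, random_state=0).fit(X, y)

# Tests the tree never consults; candidates for elimination
unused = [f for f, w in zip(features, model.feature_importances_) if w == 0]
print(unused)  # everything except test3
```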