machine-learning-atoz/section_1_data_processing/main.py

43 lines
1.2 KiB
Python
Raw Normal View History

2024-10-25 11:48:10 +00:00
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Load the dataset. Every column except the last is treated as a feature and
# the last column as the target. The target is indexed with -1 so that it
# always agrees with the `:-1` feature slice instead of a hard-coded column.
dataset = pd.read_csv("Data.csv")
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values

# Handle missing data: replace NaN entries in feature columns 1 and 2 with
# the mean of the respective column.
# NOTE(review): the imputer is fitted on all rows, i.e. before the train/test
# split below, so test rows contribute to the imputed means.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# Encode categorical data: one-hot encode feature column 0 and pass the
# remaining columns through unchanged. OneHotEncoder consumes the raw
# category values directly, so no LabelEncoder pre-pass is needed (and
# LabelEncoder is intended for target values only).
# sparse_threshold=0 forces a dense result; with the default threshold a
# column with many categories would produce a sparse matrix, which the
# float conversion below cannot handle.
ct = ColumnTransformer(
    transformers=[("col-0", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X), dtype=float)

# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# Split the dataset into a training set (80%) and a test set (20%);
# random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Feature scaling: fit the scaler on the training set only, then apply the
# same transformation to the test set.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(X_train)