# Part 6: 머신러닝 기초 실습 (Scikit-learn)

# 이 스크립트를 실행하기 전에 라이브러리를 설치해야 합니다.
# pip install scikit-learn matplotlib seaborn

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Data preparation ---
print("--- 1. 데이터 준비 ---")
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()
# X: features (sepal length/width, petal length/width); y: target (species).
X, y = iris.data, iris.target

# Summarize what was loaded: feature-matrix shape, target shape, class names,
# followed by a separator line.
print(
    f"데이터셋 크기: {X.shape}",
    f"타겟 크기: {y.shape}",
    f"클래스 종류: {iris.target_names}",
    "-" * 30,
    sep="\n",
)


# --- Train/test split ---
print("\n--- 2. 훈련/테스트 데이터 분리 ---")
# Keep 80% of the samples for training and hold out 20% for testing.
# random_state is pinned so the split is the same on every run, and
# stratify=y requests a split stratified on the class labels.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report the shapes of both feature splits, then a separator line.
print(
    f"훈련 데이터 크기: {X_train.shape}",
    f"테스트 데이터 크기: {X_test.shape}",
    "-" * 30,
    sep="\n",
)


# --- Feature scaling (standardization) ---
print("\n--- 3. 데이터 스케일링 ---")
# Features on different scales can affect model performance, so standardize.
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("데이터 스케일링 완료 (StandardScaler 적용)", "-" * 30, sep="\n")


# --- Model training ---
print("\n--- 4. 모델 훈련 ---")

# Model 1: logistic regression. fit() returns the estimator itself, so
# construction and training are chained into a single expression.
log_reg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
print("로지스틱 회귀 모델 훈련 완료")

# Model 2: decision tree with its depth capped at 3, trained on the same
# scaled features.
tree_clf = DecisionTreeClassifier(random_state=42, max_depth=3).fit(
    X_train_scaled, y_train
)
print("결정 트리 모델 훈련 완료", "-" * 30, sep="\n")


# --- Prediction and evaluation ---
print("\n--- 5. 예측 및 평가 ---")


def _print_evaluation(header, accuracy, predictions):
    """Print the accuracy and the classification report for one model.

    Args:
        header: Title line printed before the metrics.
        accuracy: Accuracy score already computed for ``predictions``.
        predictions: Predicted test-set labels; compared against ``y_test``.
    """
    print(header)
    print(f"정확도(Accuracy): {accuracy:.4f}")
    print("분류 보고서(Classification Report):")
    print(classification_report(y_test, predictions, target_names=iris.target_names))


# Logistic regression: predict on the scaled test features, then score.
y_pred_log_reg = log_reg.predict(X_test_scaled)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
_print_evaluation("\n--- 로지스틱 회귀 평가 결과 ---", accuracy_log_reg, y_pred_log_reg)

# Decision tree: same procedure on the same test features.
y_pred_tree = tree_clf.predict(X_test_scaled)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
_print_evaluation("\n--- 결정 트리 평가 결과 ---", accuracy_tree, y_pred_tree)
print("-" * 30)


# --- Result visualization (confusion matrices) ---
print("\n--- 6. 결과 시각화 ---")


def _save_confusion_matrix(matrix, class_names, title, cmap, path):
    """Render a confusion matrix as an annotated heatmap and save it to a file.

    Args:
        matrix: Confusion matrix of integer counts (rows: true, cols: predicted).
        class_names: Tick labels used for both axes.
        title: Title drawn above the heatmap.
        cmap: Name of the colormap passed to seaborn.
        path: Destination image path.
    """
    fig = plt.figure(figsize=(8, 6))
    try:
        # heatmap() draws on the current axes (created inside the new figure)
        # and returns them, so labels are set on that axes object explicitly.
        ax = sns.heatmap(
            matrix,
            annot=True,
            fmt='d',
            cmap=cmap,
            xticklabels=class_names,
            yticklabels=class_names,
        )
        ax.set_title(title)
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')
        fig.savefig(path)
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)


# Make sure the output directory exists. exist_ok=True avoids the race between
# a separate existence check and the directory creation.
output_dir = "plot_outputs"
os.makedirs(output_dir, exist_ok=True)
print(f"시각화 결과는 '{output_dir}' 폴더에 저장됩니다.")

# Logistic regression confusion matrix
cm_log_reg = confusion_matrix(y_test, y_pred_log_reg)
log_reg_cm_path = os.path.join(output_dir, "logistic_regression_confusion_matrix.png")
_save_confusion_matrix(
    cm_log_reg,
    iris.target_names,
    'Logistic Regression - Confusion Matrix',
    'Blues',
    log_reg_cm_path,
)
print(f"로지스틱 회귀 혼동 행렬 저장 완료: {log_reg_cm_path}")

# Decision tree confusion matrix
cm_tree = confusion_matrix(y_test, y_pred_tree)
tree_cm_path = os.path.join(output_dir, "decision_tree_confusion_matrix.png")
_save_confusion_matrix(
    cm_tree,
    iris.target_names,
    'Decision Tree - Confusion Matrix',
    'Greens',
    tree_cm_path,
)
print(f"결정 트리 혼동 행렬 저장 완료: {tree_cm_path}")

print("-" * 30)