Upload New File

51627041 · Administrator · 9050bb5c · 51627041
Commit 51627041 authored Sep 04, 2025 by Administrator
--- a/2_services/ai_lecture/AI_EDUCATION_SUMMARY.md
+++ b/2_services/ai_lecture/AI_EDUCATION_SUMMARY.md
+# 🚀 AI 교육 종합 정리 문서
+## Python + NumPy + Pandas + Scikit-learn + Seaborn 마스터 가이드
+
+> **📅 작성일**: 2024년  
+> **🎯 목적**: 1달간 공부를 쉬어도 다시 보면 이해할 수 있는 종합 정리서  
+> **👥 대상**: AI 교육 과정을 수료한 직원들
+
+---
+
+## 📋 목차
+
+1. [Python 핵심 문법 요약](#1-python-핵심-문법-요약)
+2. [NumPy - 수치 계산의 기초](#2-numpy---수치-계산의-기초)
+3. [Pandas - 데이터 분석의 핵심](#3-pandas---데이터-분석의-핵심)
+4. [Scikit-learn - 머신러닝 라이브러리](#4-scikit-learn---머신러닝-라이브러리)
+5. [Seaborn - 데이터 시각화](#5-seaborn---데이터-시각화)
+6. [실전 프로젝트 예제](#6-실전-프로젝트-예제)
+7. [자주 사용하는 코드 패턴](#7-자주-사용하는-코드-패턴)
+8. [문제 해결 가이드](#8-문제-해결-가이드)
+
+---
+
+## 1. Python 핵심 문법 요약
+
+### 1.1 기본 데이터 타입과 변수
+
+```python
+# 숫자
+age = 25          # 정수
+height = 175.5    # 실수
+complex_num = 3 + 4j  # 복소수
+
+# 문자열
+name = "홍길동"
+message = '안녕하세요'
+multi_line = """
+여러 줄의
+문자열입니다
+"""
+
+# 불린
+is_student = True
+is_working = False
+
+# None (값이 없음)
+empty_value = None
+```
+
+### 1.2 컬렉션 데이터 타입
+
+```python
+# 리스트 (수정 가능, 순서 있음)
+fruits = ['사과', '바나나', '오렌지']
+fruits.append('포도')        # 추가
+fruits[0] = '배'           # 수정
+first_fruit = fruits[0]    # 접근
+
+# 튜플 (수정 불가, 순서 있음)
+coordinates = (10, 20)
+x, y = coordinates         # 언패킹
+
+# 딕셔너리 (키-값 쌍)
+person = {
+    'name': '김철수',
+    'age': 30,
+    'city': '서울'
+}
+person['job'] = '개발자'   # 추가
+name = person['name']      # 접근
+
+# 집합 (중복 없음, 순서 없음)
+unique_numbers = {1, 2, 3, 3, 4}  # 결과: {1, 2, 3, 4}
+```
+
+### 1.3 제어문
+
+```python
+# if-elif-else
+score = 85
+if score >= 90:
+    grade = 'A'
+elif score >= 80:
+    grade = 'B'
+elif score >= 70:
+    grade = 'C'
+else:
+    grade = 'D'
+
+# for 반복문
+for i in range(5):
+    print(i)  # 0, 1, 2, 3, 4
+
+for fruit in fruits:
+    print(f"과일: {fruit}")
+
+# while 반복문
+count = 0
+while count < 3:
+    print(f"카운트: {count}")
+    count += 1
+
+# 리스트 컴프리헨션
+squares = [x**2 for x in range(5)]  # [0, 1, 4, 9, 16]
+even_squares = [x**2 for x in range(10) if x % 2 == 0]
+```
+
+### 1.4 함수
+
+```python
+# 기본 함수 정의
+def greet(name, age=20):
+    """사람에게 인사하는 함수"""
+    return f"안녕하세요, {name}님! 나이는 {age}세입니다."
+
+# 함수 호출
+message = greet("김철수", 25)
+message2 = greet("이영희")  # age는 기본값 20 사용
+
+# 람다 함수 (간단한 한 줄 함수)
+add = lambda x, y: x + y
+result = add(3, 5)  # 8
+
+# 여러 값 반환
+def get_info():
+    return "홍길동", 30, "서울"
+
+name, age, city = get_info()
+```
+
+### 1.5 클래스와 객체지향 프로그래밍
+
+```python
+class Person:
+    def __init__(self, name, age):
+        self.name = name
+        self.age = age
+    
+    def introduce(self):
+        return f"저는 {self.name}이고, {self.age}살입니다."
+    
+    def have_birthday(self):
+        self.age += 1
+        return f"{self.name}의 생일! 나이는 {self.age}살이 되었습니다."
+
+# 객체 생성과 사용
+person1 = Person("김철수", 25)
+print(person1.introduce())  # 저는 김철수이고, 25살입니다.
+print(person1.have_birthday())  # 김철수의 생일! 나이는 26살이 되었습니다.
+```
+
+---
+
+## 2. NumPy - 수치 계산의 기초
+
+### 2.1 NumPy 배열 생성과 기본 연산
+
+```python
+import numpy as np
+
+# 배열 생성
+arr1 = np.array([1, 2, 3, 4, 5])
+arr2 = np.array([[1, 2, 3], [4, 5, 6]])  # 2차원 배열
+
+# 특별한 배열 생성
+zeros = np.zeros((3, 4))      # 3x4 영행렬
+ones = np.ones((2, 3))        # 2x3 일행렬
+range_arr = np.arange(0, 10, 2)  # [0, 2, 4, 6, 8]
+linspace = np.linspace(0, 1, 5)  # [0, 0.25, 0.5, 0.75, 1]
+
+# 배열 속성
+print(f"배열 모양: {arr2.shape}")      # (2, 3)
+print(f"배열 차원: {arr2.ndim}")       # 2
+print(f"배열 크기: {arr2.size}")       # 6
+print(f"데이터 타입: {arr2.dtype}")    # int64
+```
+
+### 2.2 배열 인덱싱과 슬라이싱
+
+```python
+# 1차원 배열 인덱싱
+arr = np.array([10, 20, 30, 40, 50])
+print(arr[0])      # 10 (첫 번째 요소)
+print(arr[-1])     # 50 (마지막 요소)
+print(arr[1:4])    # [20, 30, 40] (1번부터 3번까지)
+print(arr[::2])    # [10, 30, 50] (2칸씩 건너뛰기)
+
+# 2차원 배열 인덱싱
+matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+print(matrix[0, 1])     # 2 (0행 1열)
+print(matrix[1:3, :])   # [[4, 5, 6], [7, 8, 9]] (1~2행, 모든 열)
+print(matrix[:, 1])     # [2, 5, 8] (모든 행, 1열)
+
+# 불린 인덱싱
+mask = arr > 30
+print(arr[mask])        # [40, 50] (30보다 큰 값들)
+```
+
+### 2.3 배열 연산
+
+```python
+# 기본 산술 연산
+a = np.array([1, 2, 3])
+b = np.array([4, 5, 6])
+
+print(a + b)      # [5, 7, 9] (덧셈)
+print(a - b)      # [-3, -3, -3] (뺄셈)
+print(a * b)      # [4, 10, 18] (요소별 곱셈)
+print(a / b)      # [0.25, 0.4, 0.5] (요소별 나눗셈)
+print(a ** 2)     # [1, 4, 9] (제곱)
+
+# 브로드캐스팅 (자동으로 크기 맞춤)
+arr = np.array([1, 2, 3, 4])
+print(arr + 10)   # [11, 12, 13, 14] (모든 요소에 10 더하기)
+print(arr * 2)    # [2, 4, 6, 8] (모든 요소에 2 곱하기)
+```
+
+### 2.4 통계 함수
+
+```python
+data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+
+print(f"평균: {np.mean(data)}")           # 5.5
+print(f"중앙값: {np.median(data)}")       # 5.5
+print(f"표준편차: {np.std(data)}")        # 2.8722813232690143 (모표준편차, 기본값 ddof=0)
+print(f"분산: {np.var(data)}")           # 8.25 (모분산, 기본값 ddof=0 / 표본분산은 ddof=1 → 9.1667)
+print(f"최솟값: {np.min(data)}")         # 1
+print(f"최댓값: {np.max(data)}")         # 10
+print(f"합계: {np.sum(data)}")           # 55
+```
+
+### 2.5 선형대수 연산
+
+```python
+# 행렬 생성
+A = np.array([[1, 2], [3, 4]])
+B = np.array([[5, 6], [7, 8]])
+
+# 행렬 곱셈
+C = np.dot(A, B)  # 또는 A @ B
+print("행렬 곱셈:")
+print(C)
+
+# 행렬 전치
+A_T = A.T
+print("전치 행렬:")
+print(A_T)
+
+# 행렬식
+det_A = np.linalg.det(A)
+print(f"행렬식: {det_A}")
+
+# 역행렬
+A_inv = np.linalg.inv(A)
+print("역행렬:")
+print(A_inv)
+
+# 고유값과 고유벡터
+eigenvalues, eigenvectors = np.linalg.eig(A)
+print(f"고유값: {eigenvalues}")
+print("고유벡터:")
+print(eigenvectors)
+```
+
+---
+
+## 3. Pandas - 데이터 분석의 핵심
+
+### 3.1 Series와 DataFrame 생성
+
+```python
+import pandas as pd
+
+# Series 생성 (1차원 데이터)
+s1 = pd.Series([1, 3, 5, 7, 9])
+s2 = pd.Series([1, 3, 5, 7, 9], index=['a', 'b', 'c', 'd', 'e'])
+
+# DataFrame 생성 (2차원 데이터)
+data = {
+    '이름': ['김철수', '이영희', '박민수', '최지영'],
+    '나이': [25, 28, 22, 30],
+    '직업': ['개발자', '디자이너', '학생', '마케터'],
+    '급여': [3000, 3500, 0, 4000]
+}
+df = pd.DataFrame(data)
+
+# CSV 파일 읽기
+df_from_csv = pd.read_csv('data.csv')
+df_from_excel = pd.read_excel('data.xlsx')
+```
+
+### 3.2 데이터 탐색과 기본 정보
+
+```python
+# 기본 정보 확인
+df.info()                  # 데이터 타입, 메모리 사용량 등 (직접 출력하고 None을 반환하므로 print 불필요)
+print(df.describe())       # 수치형 데이터 통계 요약
+print(df.head())           # 처음 5행
+print(df.tail(3))          # 마지막 3행
+print(df.shape)            # (행 수, 열 수)
+print(df.columns)          # 열 이름들
+print(df.index)            # 인덱스
+
+# 데이터 타입 확인
+print(df.dtypes)
+
+# 결측값 확인
+print(df.isnull().sum())   # 각 열의 결측값 개수
+```
+
+### 3.3 데이터 선택과 필터링
+
+```python
+# 열 선택
+names = df['이름']                    # 단일 열
+subset = df[['이름', '나이']]        # 여러 열
+subset2 = df.loc[:, ['이름', '급여']] # loc 사용
+
+# 행 선택
+first_row = df.iloc[0]               # 첫 번째 행
+first_two = df.iloc[0:2]             # 처음 2행
+specific_rows = df.iloc[[0, 2]]      # 0번, 2번 행
+
+# 조건부 필터링
+young_people = df[df['나이'] < 25]   # 25세 미만
+high_salary = df[df['급여'] > 3000]  # 급여 3000 초과
+developers = df[df['직업'] == '개발자'] # 직업이 개발자인 사람
+
+# 복합 조건
+young_dev = df[(df['나이'] < 30) & (df['직업'] == '개발자')]
+```
+
+### 3.4 데이터 수정과 추가
+
+```python
+# 열 추가
+df['경력'] = [2, 5, 0, 8]
+df['급여등급'] = df['급여'].apply(lambda x: '고급' if x > 3500 else '일반')
+
+# 열 수정
+df.loc[df['직업'] == '학생', '급여'] = 1000
+
+# 행 추가
+new_row = pd.DataFrame({'이름': ['정수민'], '나이': [27], '직업': ['기획자'], '급여': [3800]})
+df = pd.concat([df, new_row], ignore_index=True)
+
+# 열 이름 변경 (이후 예제에서 '급여', '직업' 컬럼을 계속 쓰므로 원본 df는 유지하고 새 변수에 저장)
+df_renamed = df.rename(columns={'급여': '월급', '직업': '직종'})
+```
+
+### 3.5 데이터 그룹화와 집계
+
+```python
+# 그룹별 통계
+job_stats = df.groupby('직업').agg({
+    '나이': ['mean', 'min', 'max'],
+    '급여': ['mean', 'sum', 'count']
+})
+
+# 피벗 테이블
+pivot_table = df.pivot_table(
+    values='급여',
+    index='직업',
+    columns='급여등급',
+    aggfunc='mean',
+    fill_value=0
+)
+
+# 크로스탭
+cross_tab = pd.crosstab(df['직업'], df['급여등급'])
+```
+
+### 3.6 결측값 처리
+
+```python
+# 결측값 확인
+print(df.isnull().sum())
+
+# 결측값 처리 방법들
+df_drop = df.dropna()                    # 결측값이 있는 행 삭제
+df_fill = df.fillna(0)                   # 0으로 채우기
+df_fill_mean = df['급여'].fillna(df['급여'].mean())  # 평균으로 채우기
+df_interpolate = df.interpolate()        # 보간법으로 채우기
+```
+
+---
+
+## 4. Scikit-learn - 머신러닝 라이브러리
+
+### 4.1 머신러닝 기본 개념
+
+```python
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.metrics import mean_squared_error, r2_score
+```
+
+### 4.2 데이터 전처리
+
+```python
+# 데이터 분할 (훈련/테스트)
+X = df[['나이', '경력']]  # 특성
+y = df['급여등급']        # 타겟
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+# 특성 스케일링
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+# 범주형 데이터 인코딩
+from sklearn.preprocessing import LabelEncoder
+le = LabelEncoder()
+y_train_encoded = le.fit_transform(y_train)
+y_test_encoded = le.transform(y_test)
+```
+
+### 4.3 지도학습 - 분류
+
+```python
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+
+# 로지스틱 회귀
+lr = LogisticRegression(random_state=42)
+lr.fit(X_train_scaled, y_train_encoded)
+lr_pred = lr.predict(X_test_scaled)
+
+# 결정 트리
+dt = DecisionTreeClassifier(random_state=42)
+dt.fit(X_train_scaled, y_train_encoded)
+dt_pred = dt.predict(X_test_scaled)
+
+# 랜덤 포레스트
+rf = RandomForestClassifier(n_estimators=100, random_state=42)
+rf.fit(X_train_scaled, y_train_encoded)
+rf_pred = rf.predict(X_test_scaled)
+
+# 성능 평가
+print(f"로지스틱 회귀 정확도: {accuracy_score(y_test_encoded, lr_pred):.3f}")
+print(f"결정 트리 정확도: {accuracy_score(y_test_encoded, dt_pred):.3f}")
+print(f"랜덤 포레스트 정확도: {accuracy_score(y_test_encoded, rf_pred):.3f}")
+```
+
+### 4.4 지도학습 - 회귀
+
+```python
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+
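+# 주의: 회귀의 타겟(y)은 연속형 수치여야 합니다 (예: y = df['급여']).
+#       4.2의 y('급여등급')는 범주형이므로, 회귀에는 수치형 타겟으로 다시 분할한 y_train/y_test를 사용하세요.
+# 선형 회귀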
+lr_reg = LinearRegression()
+lr_reg.fit(X_train_scaled, y_train)
+lr_pred_reg = lr_reg.predict(X_test_scaled)
+
+# 랜덤 포레스트 회귀
+rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
+rf_reg.fit(X_train_scaled, y_train)
+rf_pred_reg = rf_reg.predict(X_test_scaled)
+
+# 성능 평가
+print(f"선형 회귀 R²: {r2_score(y_test, lr_pred_reg):.3f}")
+print(f"랜덤 포레스트 R²: {r2_score(y_test, rf_pred_reg):.3f}")
+print(f"선형 회귀 MSE: {mean_squared_error(y_test, lr_pred_reg):.3f}")
+```
+
+### 4.5 비지도학습 - 군집화
+
+```python
+from sklearn.cluster import KMeans
+from sklearn.cluster import DBSCAN
+
+# K-means 군집화
+X_scaled = StandardScaler().fit_transform(X)  # 4.2의 전체 특성 X를 스케일링 (군집화는 훈련/테스트 분할 없이 사용)
+kmeans = KMeans(n_clusters=3, random_state=42)
+clusters = kmeans.fit_predict(X_scaled)
+
+# 군집 결과 시각화
+import matplotlib.pyplot as plt
+plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap='viridis')
+plt.title('K-means 군집화 결과')
+plt.show()
+
+# 최적 클러스터 수 찾기 (엘보우 메서드)
+inertias = []
+K_range = range(1, 11)
+for k in K_range:
+    kmeans = KMeans(n_clusters=k, random_state=42)
+    kmeans.fit(X_scaled)
+    inertias.append(kmeans.inertia_)
+
+plt.plot(K_range, inertias, 'bx-')
+plt.xlabel('k')
+plt.ylabel('Inertia')
+plt.title('엘보우 메서드')
+plt.show()
+```
+
+### 4.6 모델 성능 향상
+
+```python
+from sklearn.model_selection import GridSearchCV, cross_val_score
+
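+# 교차 검증 (4.2의 전체 X, y를 스케일링/인코딩해서 사용)
+X_scaled = StandardScaler().fit_transform(X)
+y_encoded = LabelEncoder().fit_transform(y)
+cv_scores = cross_val_score(rf, X_scaled, y_encoded, cv=5)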
+print(f"교차 검증 점수: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
+
+# 하이퍼파라미터 튜닝
+param_grid = {
+    'n_estimators': [50, 100, 200],
+    'max_depth': [10, 20, None],
+    'min_samples_split': [2, 5, 10]
+}
+
+grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
+grid_search.fit(X_scaled, y_encoded)
+
+print(f"최적 파라미터: {grid_search.best_params_}")
+print(f"최고 점수: {grid_search.best_score_:.3f}")
+```
+
+---
+
+## 5. Seaborn - 데이터 시각화
+
+### 5.1 기본 시각화
+
+```python
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# 스타일 설정
+sns.set_style("whitegrid")
+plt.rcParams['font.family'] = 'Malgun Gothic'  # 한글 폰트 설정
+
+# 히스토그램
+plt.figure(figsize=(10, 6))
+sns.histplot(data=df, x='나이', bins=10, kde=True)
+plt.title('나이 분포')
+plt.show()
+
+# 박스플롯
+plt.figure(figsize=(10, 6))
+sns.boxplot(data=df, x='직업', y='급여')
+plt.title('직업별 급여 분포')
+plt.xticks(rotation=45)
+plt.show()
+```
+
+### 5.2 관계형 시각화
+
+```python
+# 산점도
+plt.figure(figsize=(10, 6))
+sns.scatterplot(data=df, x='나이', y='급여', hue='직업', size='경력')
+plt.title('나이와 급여의 관계')
+plt.show()
+
+# 회귀선이 있는 산점도
+plt.figure(figsize=(10, 6))
+sns.regplot(data=df, x='나이', y='급여', scatter_kws={'alpha':0.6})
+plt.title('나이와 급여의 선형 관계')
+plt.show()
+
+# 상관관계 히트맵
+numeric_cols = df.select_dtypes(include=[np.number])
+correlation_matrix = numeric_cols.corr()
+
+plt.figure(figsize=(8, 6))
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
+plt.title('상관관계 히트맵')
+plt.show()
+```
+
+### 5.3 분포 시각화
+
+```python
+# 분포 비교
+plt.figure(figsize=(12, 6))
+sns.kdeplot(data=df, x='급여', hue='직업', common_norm=False)
+plt.title('직업별 급여 분포')
+plt.show()
+
+# 바이올린 플롯
+plt.figure(figsize=(12, 6))
+sns.violinplot(data=df, x='직업', y='급여')
+plt.title('직업별 급여 분포 (바이올린 플롯)')
+plt.xticks(rotation=45)
+plt.show()
+
+# 페어플롯 (pairplot은 자체 Figure를 생성하므로 plt.figure() 대신 height로 크기를 조절)
+sns.pairplot(df, hue='직업', diag_kind='kde', height=3)
+plt.show()
+```
+
+### 5.4 카테고리형 데이터 시각화
+
+```python
+# 막대 그래프
+plt.figure(figsize=(10, 6))
+sns.countplot(data=df, x='직업')
+plt.title('직업별 인원 수')
+plt.xticks(rotation=45)
+plt.show()
+
+# 그룹별 통계
+plt.figure(figsize=(12, 6))
+sns.barplot(data=df, x='직업', y='급여', errorbar=('ci', 95))  # seaborn 0.12+ (이전 버전에서는 ci=95)
+plt.title('직업별 평균 급여 (95% 신뢰구간)')
+plt.xticks(rotation=45)
+plt.show()
+```
+
+---
+
+## 6. 실전 프로젝트 예제
+
+### 6.1 간단한 머신러닝 파이프라인
+
+```python
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import classification_report
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# 1. 데이터 로드 및 탐색
+def load_and_explore_data(file_path):
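+    """데이터를 로드하고 기본 정보를 탐색합니다. (파일 경로 또는 DataFrame을 받습니다.)"""
+    # main()처럼 이미 만들어진 DataFrame이 넘어오면 그대로 사용하고, 경로가 넘어오면 CSV로 읽습니다.
+    df = file_path if isinstance(file_path, pd.DataFrame) else pd.read_csv(file_path)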
+    
+    print("=== 데이터 기본 정보 ===")
+    print(f"데이터 크기: {df.shape}")
+    print(f"컬럼: {list(df.columns)}")
+    print("\n=== 데이터 미리보기 ===")
+    print(df.head())
+    print("\n=== 데이터 타입 ===")
+    print(df.dtypes)
+    print("\n=== 결측값 ===")
+    print(df.isnull().sum())
+    print("\n=== 수치형 데이터 통계 ===")
+    print(df.describe())
+    
+    return df
+
+# 2. 데이터 전처리
+def preprocess_data(df, target_column):
+    """데이터를 전처리합니다."""
+    # 결측값 처리
+    df_clean = df.dropna()
+    
+    # 특성과 타겟 분리
+    X = df_clean.drop(target_column, axis=1)
+    y = df_clean[target_column]
+    
+    # 수치형 컬럼만 선택
+    numeric_columns = X.select_dtypes(include=[np.number]).columns
+    X_numeric = X[numeric_columns]
+    
+    # 데이터 분할
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_numeric, y, test_size=0.2, random_state=42
+    )
+    
+    # 스케일링
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+    
+    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
+
+# 3. 모델 훈련 및 평가
+def train_and_evaluate_model(X_train, X_test, y_train, y_test):
+    """모델을 훈련하고 평가합니다."""
+    # 모델 훈련
+    model = RandomForestClassifier(n_estimators=100, random_state=42)
+    model.fit(X_train, y_train)
+    
+    # 예측
+    y_pred = model.predict(X_test)
+    
+    # 성능 평가
+    print("=== 분류 보고서 ===")
+    print(classification_report(y_test, y_pred))
+    
+    return model, y_pred
+
+# 4. 결과 시각화
+def visualize_results(df, target_column, model, X_test_scaled, y_test, y_pred):
+    """결과를 시각화합니다."""
+    # 특성 중요도
+    feature_importance = pd.DataFrame({
+        'feature': df.drop(target_column, axis=1).select_dtypes(include=[np.number]).columns,
+        'importance': model.feature_importances_
+    }).sort_values('importance', ascending=False)
+    
+    plt.figure(figsize=(10, 6))
+    sns.barplot(data=feature_importance, x='importance', y='feature')
+    plt.title('특성 중요도')
+    plt.show()
+    
+    # 예측 vs 실제 비교
+    plt.figure(figsize=(8, 6))
+    plt.scatter(range(len(y_test)), y_test, alpha=0.7, label='실제값')
+    plt.scatter(range(len(y_pred)), y_pred, alpha=0.7, label='예측값')
+    plt.xlabel('샘플')
+    plt.ylabel('값')
+    plt.title('예측값 vs 실제값 비교')
+    plt.legend()
+    plt.show()
+
+# 메인 실행 함수
+def main():
+    # 예시 데이터 (실제로는 파일에서 로드)
+    np.random.seed(42)
+    n_samples = 1000
+    
+    # 가상의 데이터 생성
+    data = {
+        'feature1': np.random.normal(0, 1, n_samples),
+        'feature2': np.random.normal(0, 1, n_samples),
+        'feature3': np.random.normal(0, 1, n_samples),
+        'target': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
+    }
+    
+    df = pd.DataFrame(data)
+    
+    # 1. 데이터 탐색
+    df = load_and_explore_data(df)
+    
+    # 2. 데이터 전처리
+    X_train_scaled, X_test_scaled, y_train, y_test, scaler = preprocess_data(
+        df, 'target'
+    )
+    
+    # 3. 모델 훈련 및 평가
+    model, y_pred = train_and_evaluate_model(
+        X_train_scaled, X_test_scaled, y_train, y_test
+    )
+    
+    # 4. 결과 시각화
+    visualize_results(df, 'target', model, X_test_scaled, y_test, y_pred)
+    
+    print("=== 프로젝트 완료! ===")
+
+if __name__ == "__main__":
+    main()
+```
+
+---
+
+## 7. 자주 사용하는 코드 패턴
+
+### 7.1 데이터 로딩 및 저장
+
+```python
+# 다양한 형식의 파일 읽기
+df_csv = pd.read_csv('data.csv', encoding='utf-8')
+df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')
+df_json = pd.read_json('data.json')
+
+# 데이터 저장
+df.to_csv('output.csv', index=False, encoding='utf-8')
+df.to_excel('output.xlsx', index=False)
+df.to_json('output.json', orient='records')
+```
+
+### 7.2 데이터 정제
+
+```python
+# 중복 제거
+df_clean = df.drop_duplicates()
+
+# 컬럼명 정리
+df.columns = df.columns.str.lower().str.replace(' ', '_')
+
+# 데이터 타입 변환
+df['날짜'] = pd.to_datetime(df['날짜'])
+df['카테고리'] = df['카테고리'].astype('category')
+
+# 조건부 데이터 변환
+df['급여등급'] = np.where(df['급여'] > 3500, '고급', '일반')
+```
+
+### 7.3 시계열 데이터 처리
+
+```python
+# 날짜 인덱스 설정
+df['날짜'] = pd.to_datetime(df['날짜'])
+df.set_index('날짜', inplace=True)
+
+# 시간별 그룹화
+monthly_data = df.resample('M').mean()  # pandas 2.2 이상에서는 'M' 대신 'ME'(월말) 사용 ('M'은 deprecated)
+daily_data = df.resample('D').sum()
+
+# 이동평균
+df['이동평균_7일'] = df['값'].rolling(window=7).mean()
+```
+
+### 7.4 병렬 처리
+
+```python
+from multiprocessing import Pool
+import pandas as pd
+
+def process_chunk(chunk):
+    """데이터 청크를 처리하는 함수"""
+    return chunk.apply(some_processing_function)
+
+def parallel_process_data(df, n_processes=4):
+    """데이터를 병렬로 처리합니다."""
+    # 데이터를 청크로 분할
+    chunk_size = max(1, len(df) // n_processes)  # 행 수가 n_processes보다 적어도 range()의 step이 0이 되지 않도록
+    chunks = [df[i:i+chunk_size] for i in range(0, len(df), chunk_size)]
+    
+    # 병렬 처리
+    with Pool(n_processes) as pool:
+        results = pool.map(process_chunk, chunks)
+    
+    # 결과 합치기
+    return pd.concat(results, ignore_index=True)
+```
+
+---
+
+## 8. 문제 해결 가이드
+
+### 8.1 자주 발생하는 오류와 해결방법
+
+#### 8.1.1 인덱싱 오류
+```python
+# 오류: KeyError: 'column_name'
+# 해결: 컬럼 존재 여부 확인
+if 'column_name' in df.columns:
+    result = df['column_name']
+else:
+    print("컬럼이 존재하지 않습니다.")
+
+# 오류: IndexError: single positional indexer is out-of-bounds
+# 해결: 인덱스 범위 확인
+if len(df) > 0:
+    first_row = df.iloc[0]
+else:
+    print("데이터프레임이 비어있습니다.")
+```
+
+#### 8.1.2 데이터 타입 오류
+```python
+# 오류: TypeError: can't multiply sequence by non-int of type 'float'
+# 해결: 데이터 타입 변환
+df['수치컬럼'] = pd.to_numeric(df['수치컬럼'], errors='coerce')
+
+# 오류: ValueError: cannot convert float NaN to integer
+# 해결: 결측값 처리 후 변환
+df['정수컬럼'] = df['정수컬럼'].fillna(0).astype(int)
+```
+
+#### 8.1.3 메모리 오류
+```python
+# 대용량 데이터 처리 시 메모리 절약
+# 1. 데이터 타입 최적화
+df['작은정수'] = df['작은정수'].astype('int8')
+df['작은실수'] = df['작은실수'].astype('float32')
+
+# 2. 청크 단위 처리
+chunk_size = 10000
+for chunk in pd.read_csv('large_file.csv', chunksize=chunk_size):
+    process_chunk(chunk)
+```
+
+### 8.2 성능 최적화 팁
+
+```python
+# 1. 벡터화 연산 사용 (반복문 대신)
+# 느린 방법
+for i in range(len(df)):
+    df.loc[i, '새컬럼'] = df.loc[i, '컬럼1'] + df.loc[i, '컬럼2']
+
+# 빠른 방법
+df['새컬럼'] = df['컬럼1'] + df['컬럼2']
+
+# 2. 적절한 데이터 타입 사용
+df['카테고리'] = df['카테고리'].astype('category')  # 메모리 절약
+
+# 3. 불필요한 컬럼 제거
+df = df.drop(['사용하지않는컬럼1', '사용하지않는컬럼2'], axis=1)
+
+# 4. 인덱스 최적화
+df.set_index('자주사용하는컬럼', inplace=True)
+```
+
+### 8.3 디버깅 팁
+
+```python
+# 1. 데이터 상태 확인
+print(f"데이터프레임 크기: {df.shape}")
+print(f"메모리 사용량: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
+
+# 2. 중간 결과 확인
+print("전처리 후 데이터:")
+print(df.head())
+print(f"결측값: {df.isnull().sum()}")
+
+# 3. 예외 처리
+try:
+    result = some_function(df)
+except Exception as e:
+    print(f"오류 발생: {e}")
+    print("데이터프레임 정보:")
+    df.info()  # info()는 직접 출력하고 None을 반환하므로 f-string 안에 넣지 않습니다
+    raise
+```
+
+---
+
+## 📚 추가 학습 자료
+
+### 온라인 리소스
+- **Python 공식 문서**: https://docs.python.org/
+- **NumPy 공식 문서**: https://numpy.org/doc/
+- **Pandas 공식 문서**: https://pandas.pydata.org/docs/
+- **Scikit-learn 공식 문서**: https://scikit-learn.org/stable/
+- **Seaborn 공식 문서**: https://seaborn.pydata.org/
+
+### 추천 도서
+- "파이썬 데이터 사이언스 핸드북" - Jake VanderPlas
+- "파이썬 머신러닝 완벽 가이드" - 권철민
+- "데이터 분석을 위한 파이썬 라이브러리" - Wes McKinney
+
+### 실습 프로젝트 아이디어
+1. **부동산 가격 예측**: 지역, 면적, 연도 등으로 가격 예측
+2. **고객 이탈 예측**: 고객 행동 데이터로 이탈 가능성 예측
+3. **제품 추천 시스템**: 사용자 행동 데이터로 제품 추천
+4. **감정 분석**: 텍스트 데이터로 감정 분류
+5. **이상치 탐지**: 금융 거래 데이터에서 이상 거래 탐지
+
+---
+
+## 🎯 마무리
+
+이 문서는 지금까지 배운 AI 교육 내용의 핵심을 정리한 것입니다. 1달 후에도 이 문서를 보면 주요 개념과 코드 패턴을 쉽게 떠올릴 수 있을 것입니다.
+
+**💡 기억하세요:**
+- **Python**: 프로그래밍의 기초, 모든 것의 시작
+- **NumPy**: 수치 계산의 핵심, 선형대수의 기초
+- **Pandas**: 데이터 분석의 도구, 데이터 전처리의 필수
+- **Scikit-learn**: 머신러닝의 실전, 모델링의 핵심
+- **Seaborn**: 데이터 시각화의 예술, 인사이트 발견의 도구
+
+**🚀 다음 단계:**
+- 이 문서의 예제들을 직접 실행해보세요
+- 자신만의 프로젝트를 시작해보세요
+- 새로운 라이브러리나 기법을 탐험해보세요
+
+**💪 꾸준한 연습이 실력을 만듭니다!**
+
+---
+
+*이 문서는 AI 교육 과정의 종합 정리서입니다. 궁금한 점이나 추가로 필요한 내용이 있으면 언제든 문의해주세요.*