Kwangmin Kim - CSV/Excel 데이터 분석 에이전트

Pandas DataFrame 을 활용하여 분석을 수행하는 Agent 를 생성할 수 있다.
CSV/Excel 데이터로부터 Pandas DataFrame 객체를 생성할 수 있으며, 이를 활용하여 Agent 가 Pandas query 를 생성하여 분석을 수행할 수 있다.

# Helper for managing API keys through environment variables
from dotenv import load_dotenv

# Load the API key information
load_dotenv()

# Set up LangSmith tracing. https://smith.langchain.com
# !pip install -qU langchain-teddynote
# NOTE: `logging` below is langchain_teddynote's module, not the
# standard-library `logging` module.
from langchain_teddynote import logging

# Enter the project name.
logging.langsmith("CH15-Agent-CSV-Excel")

import pandas as pd

df = pd.read_csv("./data/titanic.csv")  # Read the CSV file.
# df2 = pd.read_excel("./data/titanic.xlsx", sheet_name="Sheet1") # Excel files can be read as well.
# Preview the first rows; the result is not assigned, so it is only visible
# in an interactive/notebook session.
df.head()

from langchain_experimental.tools import PythonAstREPLTool

# Create a tool that executes Python code.
python_tool = PythonAstREPLTool()
# Register the DataFrame in the tool's local namespace under the name `df`.
python_tool.locals["df"] = df


# Callback invoked when the agent calls a tool.
def tool_callback(tool) -> None:
    """Print the code sent to the Python REPL tool and run it locally.

    A banner line is always printed before and after. Only calls whose
    ``"tool"`` entry equals ``"python_repl_ast"`` are handled; for those,
    the ``"query"`` entry of ``"tool_input"`` is printed, executed through
    the module-level ``python_tool`` and its result printed.

    Args:
        tool: Mapping describing the tool call. The keys read here are
            ``"tool"`` (tool name) and ``"tool_input"`` (a mapping that may
            contain ``"query"``).
    """
    from collections.abc import Mapping

    banner = "<<<<<<< Code >>>>>>"
    print(banner)
    if tool.get("tool") == "python_repl_ast":
        tool_input = tool.get("tool_input")
        # `tool_input` may be missing or not a mapping; skip it instead of
        # failing with AttributeError on `.items()`.
        if isinstance(tool_input, Mapping) and "query" in tool_input:
            query = tool_input["query"]
            print(query)  # Print the query.
            result = python_tool.invoke({"query": query})
            print(result)
    print(banner)


# Callback that prints the observation produced by a tool run.
def observation_callback(observation) -> None:
    """Print the ``"observation"`` entry of *observation* between banners.

    The banner lines are printed unconditionally; the value itself is
    printed only when the ``"observation"`` key is present.
    """
    marker = "<<<<<<< Message >>>>>>"
    print(marker)
    has_payload = "observation" in observation
    if has_payload:
        print(observation["observation"])
    print(marker)


# Callback that prints the agent's final answer.
def result_callback(result: str) -> None:
    """Print *result* framed by a banner line above and below."""
    frame = "<<<<<<< 최종 답변 >>>>>>"
    # One print call, newline-separated: banner, answer, banner.
    print(frame, result, frame, sep="\n")

from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from langchain_openai import ChatOpenAI
from langchain_teddynote.messages import AgentStreamParser, AgentCallbacks

# Build a Pandas DataFrame agent over `df`, driven by an OpenAI chat model.
# NOTE(review): `allow_dangerous_code=True` presumably opts in to executing
# model-generated Python; confirm against the langchain_experimental docs
# and only run this in a trusted environment.
agent = create_pandas_dataframe_agent(
    ChatOpenAI(model="gpt-4o", temperature=0),
    df,
    verbose=False,
    agent_type="tool-calling",
    allow_dangerous_code=True,
    # Prompt prefix: analyst role, a ban on rebinding `df`, and the
    # preferred visualization style.
    prefix="You are a professional data analyst and expert in Pandas. "
    "You must use Pandas DataFrame(`df`) to answer user's request. "
    "\n\n[IMPORTANT] DO NOT create or overwrite the `df` variable in your code. \n\n"
    "If you are willing to generate visualization code, please use `plt.show()` at the end of your code. "
    "I prefer seaborn code for visualization, but you can use matplotlib as well."
    "\n\n<Visualization Preference>\n"
    "- `muted` cmap, white background, and no grid for your visualization."
    "\nRecommend to set palette parameter for seaborn plot.",
)

# Bundle the tool / observation / result callbacks and pass them to the
# stream parser that is used below to process each agent step.
parser_callback = AgentCallbacks(tool_callback, observation_callback, result_callback)
stream_parser = AgentStreamParser(parser_callback)

def ask(query):
    """Stream the agent's run for *query* and feed every step to the parser.

    The query is sent to the module-level ``agent`` under the ``"input"``
    key; each streamed step is passed to
    ``stream_parser.process_agent_steps``.
    """
    # Print the answer to the query, one streamed step at a time.
    for agent_step in agent.stream({"input": query}):
        stream_parser.process_agent_steps(agent_step)

# Print the answer to the query.
# Query: "compute corr() and visualize it as a heatmap"
# NOTE(review): this repeats the body of `ask()`; kept as-is because it
# binds the module-level names `response` and `step`.
response = agent.stream({"input": "corr() 을 구해서 히트맵 시각화"})

for step in response:
    stream_parser.process_agent_steps(step)

# Query: "How many rows are there?"
ask("몇 개의 행이 있어?")

[도구 호출]
Tool: python_repl_ast
query: len(df)
Log: 
Invoking: `python_repl_ast` with `{'query': 'len(df)'}`



[관찰 내용]
Observation: 891
[최종 답변]
데이터프레임 `df`에는 총 891개의 행이 있습니다.

# Query: "What is the difference between the survival rates of men and women?"
ask("남자와 여자의 생존율의 차이는 몇이야?")

[도구 호출]
Tool: python_repl_ast
query: male_survival_rate = df[df['Sex'] == 'male']['Survived'].mean()
female_survival_rate = df[df['Sex'] == 'female']['Survived'].mean()
survival_rate_difference = female_survival_rate - male_survival_rate
survival_rate_difference
Log: 
Invoking: `python_repl_ast` with `{'query': "male_survival_rate = df[df['Sex'] == 'male']['Survived'].mean()\nfemale_survival_rate = df[df['Sex'] == 'female']['Survived'].mean()\nsurvival_rate_difference = female_survival_rate - male_survival_rate\nsurvival_rate_difference"}`



[관찰 내용]
Observation: 0.5531300709799203
[최종 답변]
남자와 여자의 생존율의 차이는 약 0.55입니다. 즉, 여자의 생존율이 남자보다 약 55.3% 더 높습니다.

# Query: "Compute the survival rates of male and female passengers, then
# visualize them as a barplot chart"
ask("남자 승객과 여자 승객의 생존율을 구한뒤 barplot 차트로 시각화 해줘")

[도구 호출]
Tool: python_repl_ast
query: import pandas as pd
import matplotlib.pyplot as plt

# 남자와 여자 승객의 생존율 계산
survival_rate = df.groupby('Sex')['Survived'].mean()

# barplot 시각화
survival_rate.plot(kind='bar', color=['blue', 'pink'])
plt.title('Survival Rate by Gender')
plt.xlabel('Gender')
plt.ylabel('Survival Rate')
plt.xticks(rotation=0)
plt.show()
Log: 
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# 남자와 여자 승객의 생존율 계산\nsurvival_rate = df.groupby('Sex')['Survived'].mean()\n\n# barplot 시각화\nsurvival_rate.plot(kind='bar', color=['blue', 'pink'])\nplt.title('Survival Rate by Gender')\nplt.xlabel('Gender')\nplt.ylabel('Survival Rate')\nplt.xticks(rotation=0)\nplt.show()"}`



[관찰 내용]
Observation: 
[최종 답변]
남자 승객과 여자 승객의 생존율을 계산한 후, 바 차트로 시각화한 결과를 확인할 수 있습니다. 차트는 남성과 여성의 생존율을 비교하여 보여줍니다.

# Query: "Compute and visualize the survival rate by sex of children aged 10
# or under who travelled in 1st or 2nd class"
ask("1,2 등급에 탑승한 10세 이하 어린 아이의 성별별 생존율을 구하고 시각화 하세요")

[도구 호출]
Tool: python_repl_ast
query: import pandas as pd
import matplotlib.pyplot as plt

# 1, 2등급에 탑승한 10세 이하 어린 아이 필터링
children = df[(df['Pclass'].isin([1, 2])) & (df['Age'] <= 10)]

# 성별별 생존율 계산
survival_rate = children.groupby('Sex')['Survived'].mean() * 100

# 시각화
plt.figure(figsize=(8, 5))
plt.bar(survival_rate.index, survival_rate.values, color=['blue', 'pink'])
plt.title('Survival Rate of Children (Age <= 10) in 1st and 2nd Class by Gender')
plt.xlabel('Gender')
plt.ylabel('Survival Rate (%)')
plt.ylim(0, 100)
plt.grid(axis='y')
plt.show()
Log: 
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# 1, 2등급에 탑승한 10세 이하 어린 아이 필터링\nchildren = df[(df['Pclass'].isin([1, 2])) & (df['Age'] <= 10)]\n\n# 성별별 생존율 계산\nsurvival_rate = children.groupby('Sex')['Survived'].mean() * 100\n\n# 시각화\nplt.figure(figsize=(8, 5))\nplt.bar(survival_rate.index, survival_rate.values, color=['blue', 'pink'])\nplt.title('Survival Rate of Children (Age <= 10) in 1st and 2nd Class by Gender')\nplt.xlabel('Gender')\nplt.ylabel('Survival Rate (%)')\nplt.ylim(0, 100)\nplt.grid(axis='y')\nplt.show()"}`

[관찰 내용]
Observation: 
[최종 답변]
위의 코드를 실행하여 1, 2등급에 탑승한 10세 이하 어린 아이의 성별별 생존율을 계산하고 시각화했습니다. 결과는 다음과 같은 막대 그래프로 나타납니다. 각 성별에 따른 생존율을 확인할 수 있습니다.