Data Analysis Agent
Description
Data Analysis Agent - Your AI-Powered Data Insights Companion
Turn your CSV files into actionable insights with natural conversation. Simply upload your data and ask questions in plain English.
What You Can Do:
📊 Instant Overview: “Summarize my data” or “What’s in this dataset?”
🔍 Deep Analysis: “Show correlations” or “What are the trends?”
📈 Smart Visualizations: “Create a histogram of sales” or “Plot customer distribution”
💡 Pattern Discovery: “Find outliers” or “What are the relationships between columns?”
🎯 Quick Stats: “Show value counts for region” or “What’s the average revenue?”
Key Features:
No Code Required: Ask questions naturally, get visual answers
Interactive Charts: Auto-generated plots and visualizations
Statistical Analysis: Correlations, distributions, and descriptive stats
Flexible AI Options: Use OpenAI API or run models locally in your browser with WebLLM
Privacy First: All analysis happens client-side - your data never leaves your device
Perfect For:
Quick data exploration without spreadsheet formulas
Business analysts needing rapid insights
Students learning data science concepts
Anyone wanting to understand their data better
How It Works:
1. Upload your CSV file
2. Ask questions about your data
3. Get instant answers with charts and statistics
4. Dive deeper with follow-up questions
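For developers, the source below exposes two entry points you can also drive directly: load_csv_data() parses an uploaded file, and process_user_query() answers a question. A minimal sketch (sales.csv is a hypothetical file; process_user_query is async, and top-level await works in the Pyodide environment this agent runs in):

csv_text = open("sales.csv").read()
print(load_csv_data(csv_text, "sales.csv"))  # parses into the agent's global DataFrame
answer = await process_user_query("Summarize my data")
print(answer)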
Agent Source Code
# Data Analysis Agent - Data Processing and Insights
# This template provides data analysis capabilities with pandas and numpy
import pandas as pd
import numpy as np
import json
# Template Variables - Users can customize these
max_rows = 1000
precision = 2
chart_type = "bar"
include_summary = True
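# NOTE: the tool functions below accept their own parameters, so precision and
# include_summary are reserved for template customization and are not read
# directly by the code in this file.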
# Global variable to store loaded data
current_data = None
current_filename = None
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
print("[OK] LangChain core imports successful")
# Import provider-specific components
from langchain_openai import ChatOpenAI
print("[OK] LangChain OpenAI import successful")
print("[OK] All LangChain imports successful - tool calling enabled")
def dataframe_to_markdown(df, max_rows=10):
    """Convert a pandas DataFrame to a markdown table string."""
    total_rows = len(df)
    # Limit rows if needed, remembering the original length for the footer note
    if total_rows > max_rows:
        df = df.head(max_rows)
        truncated = True
    else:
        truncated = False
# Start building markdown table
lines = []
# Header row
headers = [''] + list(df.columns) # Add index column
lines.append('| ' + ' | '.join(str(h) for h in headers) + ' |')
# Separator row
lines.append('|' + '|'.join([' --- ' for _ in range(len(headers))]) + '|')
# Data rows
for idx, row in df.iterrows():
row_values = [str(idx)] + [str(v) for v in row]
lines.append('| ' + ' | '.join(row_values) + ' |')
if truncated:
        lines.append(f'\n*Showing first {max_rows} rows of {total_rows} total*')
return '\n'.join(lines)
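# Example output for a small DataFrame with columns 'a' and 'b':
#   |  | a | b |
#   | --- | --- | --- |
#   | 0 | 1 | 3 |
#   | 1 | 2 | 4 |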
@tool
def get_data_summary() -> str:
"""Get dataset summary: shape, columns, data types, statistics."""
global current_data
if current_data is None:
return "No data loaded. Please upload a CSV file first."
result = []
result.append(f"## Dataset Overview")
result.append(f"Shape: {current_data.shape[0]} rows × {current_data.shape[1]} columns")
result.append(f"\nColumns: {', '.join(current_data.columns.tolist())}")
result.append("\n### Data Types:")
for col in current_data.columns:
dtype = str(current_data[col].dtype)
result.append(f"- **{col}**: {dtype}")
result.append("\n### Missing Values:")
missing = current_data.isnull().sum()
has_missing = False
for col in current_data.columns:
if missing[col] > 0:
pct = (missing[col] / len(current_data)) * 100
result.append(f"- **{col}**: {missing[col]} missing ({pct:.1f}%)")
has_missing = True
if not has_missing:
result.append("- ✅ No missing values found")
if current_data.select_dtypes(include=[np.number]).shape[1] > 0:
result.append("\n### Summary Statistics:")
# Convert describe() output to markdown table
stats_df = current_data.describe()
result.append(dataframe_to_markdown(stats_df))
return "\n".join(result)
@tool
def get_column_info() -> str:
"""Get column info: data types and missing values."""
global current_data
if current_data is None:
return "No data loaded. Please upload a CSV file first."
result = [f"## Column Information\n"]
result.append(f"Dataset has **{len(current_data.columns)} columns** and **{len(current_data)} rows**:\n")
# Create a table
result.append("| Column | Type | Non-Null | Missing | % Missing |")
result.append("| --- | --- | --- | --- | --- |")
for col in current_data.columns:
dtype = str(current_data[col].dtype)
non_null = current_data[col].count()
total = len(current_data)
missing = total - non_null
pct_missing = (missing / total) * 100
result.append(f"| {col} | {dtype} | {non_null} | {missing} | {pct_missing:.1f}% |")
return "\n".join(result)
@tool
def get_value_counts(column: str) -> str:
"""Get value counts for a specific column."""
global current_data
if current_data is None:
return "No data loaded. Please upload a CSV file first."
if column not in current_data.columns:
return f"❌ Column '{column}' not found. Available columns: {', '.join(current_data.columns)}"
result = [f"## Value counts for '{column}'\n"]
value_counts = current_data[column].value_counts().head(15)
total = len(current_data)
# Create markdown table
result.append("| Value | Count | Percentage |")
result.append("| --- | --- | --- |")
for value, count in value_counts.items():
pct = (count / total) * 100
result.append(f"| {value} | {count} | {pct:.1f}% |")
unique_count = current_data[column].nunique()
if unique_count > 15:
result.append(f"\n*Showing top 15 of {unique_count} unique values*")
else:
result.append(f"\n*Total unique values: {unique_count}*")
return "\n".join(result)
@tool
def create_chart(column: str, chart_type: str = "histogram") -> str:
"""Create a chart for a specific column."""
global current_data
if current_data is None:
return "No data loaded. Please upload a CSV file first."
if column not in current_data.columns:
return f"❌ Column '{column}' not found. Available columns: {', '.join(current_data.columns)}"
try:
try:
import matplotlib.pyplot as plt
import base64
from io import BytesIO
plt.ioff()
except ImportError:
return "❌ Chart creation unavailable: matplotlib not installed. You can still analyze the data using other functions."
fig, ax = plt.subplots(figsize=(10, 6))
if chart_type.lower() == "bar":
value_counts = current_data[column].value_counts().head(10)
ax.bar(range(len(value_counts)), value_counts.values, color='#059669')
ax.set_xticks(range(len(value_counts)))
ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
ax.set_ylabel('Count')
ax.set_title(f'Bar Chart: {column}', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
elif chart_type.lower() == "histogram":
if pd.api.types.is_numeric_dtype(current_data[column]):
ax.hist(current_data[column].dropna(), bins=20, alpha=0.7, color='#059669', edgecolor='white')
ax.set_xlabel(column)
ax.set_ylabel('Frequency')
ax.set_title(f'Histogram: {column}', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
else:
plt.close(fig)
return f"❌ Cannot create histogram for non-numeric column '{column}'. Try 'bar' chart instead."
else:
plt.close(fig)
return f"❌ Unsupported chart type '{chart_type}'. Use: bar or histogram"
plt.tight_layout()
buffer = BytesIO()
plt.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
buffer.seek(0)
image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
plt.close(fig)
return f"""✅ Chart created successfully for '{column}' ({chart_type} chart).
<img src="data:image/png;base64,{image_base64}" alt="{chart_type.title()} chart for {column}" style="max-width: 100%; height: auto; margin: 10px 0; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
Chart shows the distribution of values in the '{column}' column."""
except Exception as e:
return f"❌ Error creating chart: {str(e)}"
@tool
def get_correlation_analysis() -> str:
"""Get correlation analysis for numeric columns."""
global current_data
if current_data is None:
return "No data loaded. Please upload a CSV file first."
numeric_cols = current_data.select_dtypes(include=[np.number]).columns
if len(numeric_cols) < 2:
return "❌ Need at least 2 numerical columns to calculate correlations."
corr_matrix = current_data[numeric_cols].corr()
result = ["## Correlation Analysis\n"]
result.append("### Correlation Matrix:\n")
# Create correlation table using markdown format
result.append(dataframe_to_markdown(corr_matrix, max_rows=20))
result.append("\n### Key Insights:")
strong_corr = []
for i in range(len(numeric_cols)):
for j in range(i+1, len(numeric_cols)):
corr_val = corr_matrix.iloc[i, j]
if abs(corr_val) > 0.7:
strength = "strong positive" if corr_val > 0 else "strong negative"
emoji = "📈" if corr_val > 0 else "📉"
strong_corr.append(f"- {emoji} **{numeric_cols[i]}** and **{numeric_cols[j]}**: {strength} correlation ({corr_val:.3f})")
if strong_corr:
result.extend(strong_corr)
else:
result.append("- ℹ️ No strong correlations found (|r| > 0.7)")
return "\n".join(result)
async def process_user_query(query: str) -> str:
"""Main function to process user queries"""
# Check if provider is 'local' (WebLLM)
provider = globals().get('PROVIDER', 'openai')
if provider == 'local':
# Use WebLLM (local in-browser inference) with Nous Research tool calling
print(f"[Local Mode] Using WebLLM with tool calling")
try:
            # Access the WebLLM engine exposed by the host page via the Pyodide js bridge
            from js import webllmEngine
# Check if WebLLM engine is loaded
if not webllmEngine:
return """❌ WebLLM model not loaded. Please click the "🚀 Load Model" button at the top of the page first.
Then you can ask questions like:
- 'Show me a summary of the data'
- 'Create a histogram of sales'
- 'Show correlations between variables'"""
# Define available tools
tools = [get_data_summary, get_column_info, get_value_counts, create_chart, get_correlation_analysis]
# Create Nous Research system prompt with tool definitions
# Use template's system_prompt field for domain instructions
# TEMPLATE_SYSTEM_PROMPT is injected by AgentHTMLGenerator from template.system_prompt
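            # create_nous_research_system_prompt and parse_hermes_tool_calls (used below)
            # are assumed to be injected by the same runtime; they are not defined in this file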
system_prompt = create_nous_research_system_prompt(
tools,
current_data,
current_filename,
TEMPLATE_SYSTEM_PROMPT # Gets value from template.system_prompt field
)
print(f"[WebLLM] Calling model with Nous Research prompt")
print(f"[WebLLM] User query: {query}")
# Call WebLLM with system prompt and user query
# Use the bridge function that's already available
result_json = await call_webllm_from_python(query, system_prompt)
print(f"[WebLLM] Raw response length: {len(result_json)}")
print(f"[WebLLM] Response preview: {result_json[:200]}")
# Parse tool calls from response
tool_calls = parse_hermes_tool_calls(result_json)
if tool_calls:
print(f"[WebLLM] Found {len(tool_calls)} tool calls")
# Execute the first tool call
tool_call = tool_calls[0]
tool_name = tool_call['name']
tool_args = tool_call['args']
print(f"[WebLLM] Executing tool: {tool_name} with args: {tool_args}")
# Find the tool by name
tool_map = {
'get_data_summary': get_data_summary,
'get_column_info': get_column_info,
'get_value_counts': get_value_counts,
'create_chart': create_chart,
'get_correlation_analysis': get_correlation_analysis
}
if tool_name in tool_map:
tool = tool_map[tool_name]
result = tool.invoke(tool_args)
print(f"[WebLLM] Tool execution successful, result length: {len(result)}")
return result
else:
return f"❌ Unknown tool: {tool_name}"
else:
# No tool calls found - return the raw response
print(f"[WebLLM] No tool calls found, returning raw response")
# Check if response contains helpful text
if len(result_json) > 50:
return result_json
else:
return """I understand you want to analyze your data, but I couldn't determine which specific tool to use.
Please try asking more specifically:
- 'Show me a summary of the data'
- 'Create a histogram of [column name]'
- 'Show correlations between numeric columns'
- 'Show value counts for [column name]'"""
except Exception as e:
print(f"[WebLLM Error] {str(e)}")
import traceback
traceback.print_exc()
return f"""❌ Error using WebLLM: {str(e)}
You can still use these direct commands:
- 'Show me a summary' - Dataset overview
- 'Show correlations' - Correlation analysis
- 'Create histogram of [column]' - Visualizations"""
    # Cloud provider mode (OpenAI) - use the hosted model with tool calling
try:
# Get API key from global variable (set by JavaScript)
api_key = globals().get('current_api_key', '')
# Debug output
print(f"Debug: API key length: {len(api_key) if api_key else 0}")
if not api_key:
return "⚠️ Please enter an OpenAI API key to use AI-powered analysis features.\n\nYou can still use basic analysis commands like:\n- 'Show me a summary'\n- 'Show correlation between variables'\n- 'Create a histogram'"
llm = ChatOpenAI(
model="gpt-4o-mini",
api_key=api_key
)
    # Validate the API key with a lightweight call first (adds one extra request per query)
try:
test_response = llm.invoke([HumanMessage(content="Hello, respond with just 'API key working'")])
print(f"Debug: API test response: {test_response.content}")
except Exception as e:
return f"❌ API key error: {str(e)}\n\nPlease check your OpenAI API key and try again."
tools = [get_data_summary, get_column_info, get_value_counts, create_chart, get_correlation_analysis]
llm_with_tools = llm.bind_tools(tools)
# Enhanced prompt to encourage tool usage
system_prompt = """You are a data analysis assistant with access to specialized tools for analyzing uploaded CSV data.
IMPORTANT: You have access to these tools - always use them when relevant:
- get_data_summary(): Get dataset overview, shape, columns, data types
- get_column_info(): Get detailed information about specific columns
- get_value_counts(column): Get value distribution for a column
- create_chart(column, chart_type): Create visualizations (histogram, bar)
- get_correlation_analysis(): Analyze correlations between numeric columns
When the user asks for analysis, charts, or insights, USE THESE TOOLS rather than providing generic code examples.
Current request: {query}"""
messages = [HumanMessage(content=system_prompt.format(query=query))]
for _ in range(3): # Max 3 iterations
response = llm_with_tools.invoke(messages)
messages.append(response)
print(f"Debug: Response has {len(response.tool_calls) if response.tool_calls else 0} tool calls")
if not response.tool_calls:
break
            for tool_call in response.tool_calls:
                # Extract the name before the try block so the error handler can log it
                tool_name = tool_call.get('name', 'unknown')
                try:
                    tool_args = tool_call['args']
# Find and execute the tool
tool_map = {
'get_data_summary': get_data_summary,
'get_column_info': get_column_info,
'get_value_counts': get_value_counts,
'create_chart': create_chart,
'get_correlation_analysis': get_correlation_analysis
}
if tool_name in tool_map:
tool = tool_map[tool_name]
tool_result = tool.invoke(tool_args)
messages.append(ToolMessage(content=tool_result, tool_call_id=tool_call['id']))
print(f"[OK] Tool {tool_name} executed successfully")
except Exception as e:
error_msg = f"Tool execution error: {str(e)}"
messages.append(ToolMessage(content=error_msg, tool_call_id=tool_call['id']))
print(f"[ERROR] Tool {tool_name} failed: {error_msg}")
# Return the last tool result directly without another API call
# This prevents the browser from freezing
tool_results = []
for msg in messages:
if isinstance(msg, ToolMessage):
tool_results.append(msg.content)
if tool_results:
print(f"[OK] Returning tool results directly, count: {len(tool_results)}")
return tool_results[-1] # Return last tool result (e.g., chart HTML)
        # If no tools were executed, return the model's last reply directly
        # instead of making one more API call
        print("[PROCESSING] No tool calls made - returning model response...")
        final_response = messages[-1]
        print(f"[OK] Final response received, length: {len(final_response.content)}")
        return final_response.content
except Exception as e:
return f"❌ Error processing query: {str(e)}"
def load_csv_data(csv_content: str, filename: str = "data.csv"):
"""Load CSV data into global variable"""
global current_data, current_filename
try:
from io import StringIO
current_data = pd.read_csv(StringIO(csv_content))
current_filename = filename
return f"✅ Loaded {current_data.shape[0]} rows and {current_data.shape[1]} columns from {filename}"
except Exception as e:
return f"❌ Error loading CSV: {str(e)}"