
Data Analysis Agent

Created by ozzo · Nov 24, 2025

Description

Data Analysis Agent - Your AI-Powered Data Insights Companion

Turn your CSV files into actionable insights with natural conversation. Simply upload your data and ask questions in plain English.

What You Can Do:

📊 Instant Overview: “Summarize my data” or “What’s in this dataset?”
🔍 Deep Analysis: “Show correlations” or “What are the trends?”
📈 Smart Visualizations: “Create a histogram of sales” or “Plot customer distribution”
💡 Pattern Discovery: “Find outliers” or “What are the relationships between columns?”
🎯 Quick Stats: “Show value counts for region” or “What’s the average revenue?”

Key Features:

No Code Required: Ask questions naturally, get visual answers
Interactive Charts: Auto-generated plots and visualizations
Statistical Analysis: Correlations, distributions, and descriptive stats
Flexible AI Options: Use OpenAI API or run models locally in your browser with WebLLM
Privacy First: Data processing runs client-side in your browser; with the WebLLM option, even model inference never leaves your device

Perfect For:

Quick data exploration without spreadsheet formulas
Business analysts needing rapid insights
Students learning data science concepts
Anyone wanting to understand their data better

How It Works:

Upload your CSV file
Ask questions about your data
Get instant answers with charts and statistics
Dive deeper with follow-up questions (the sketch below shows the same flow in code)
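
Under the hood, these four steps map onto two entry points in the agent source below: load_csv_data() and process_user_query(). A minimal scripted sketch (the file name and query are hypothetical, and this assumes running outside the hosted browser page):

import asyncio

# Step 1: load the CSV into the agent's global state
print(load_csv_data(open("sales.csv").read(), "sales.csv"))

# Steps 2-4: ask a question; process_user_query is async
answer = asyncio.run(process_user_query("Summarize my data"))
print(answer)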

Agent Source Code

# Data Analysis Agent - Data Processing and Insights
# This template provides data analysis capabilities with pandas and numpy

import pandas as pd
import numpy as np
import json

# Template Variables - Users can customize these
max_rows = 1000
precision = 2
chart_type = "bar"
include_summary = True

# Global variable to store loaded data
current_data = None
current_filename = None

from langchain_core.tools import tool
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
print("[OK] LangChain core imports successful")

# Import provider-specific components
from langchain_openai import ChatOpenAI
print("[OK] LangChain OpenAI import successful")
print("[OK] All LangChain imports successful - tool calling enabled")

def dataframe_to_markdown(df, max_rows=10):
    """Convert pandas DataFrame to markdown table format."""
    total_rows = len(df)
    # Limit rows if needed (remember the original count for the truncation note)
    if total_rows > max_rows:
        df = df.head(max_rows)
        truncated = True
    else:
        truncated = False
    
    # Start building markdown table
    lines = []
    
    # Header row
    headers = [''] + list(df.columns)  # Add index column
    lines.append('| ' + ' | '.join(str(h) for h in headers) + ' |')
    
    # Separator row
    lines.append('|' + '|'.join([' --- ' for _ in range(len(headers))]) + '|')
    
    # Data rows
    for idx, row in df.iterrows():
        row_values = [str(idx)] + [str(v) for v in row]
        lines.append('| ' + ' | '.join(row_values) + ' |')
    
    if truncated:
        lines.append(f'\n*Showing first {max_rows} rows of {total_rows} total*')
    
    return '\n'.join(lines)
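
# Illustrative call (hypothetical frame):
#   dataframe_to_markdown(pd.DataFrame({"x": [1, 2, 3]}), max_rows=2)
# renders a two-row markdown table followed by "*Showing first 2 rows of 3 total*".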

@tool
def get_data_summary() -> str:
    """Get dataset summary: shape, columns, data types, statistics."""
    global current_data
    if current_data is None:
        return "No data loaded. Please upload a CSV file first."
    
    result = []
    result.append(f"## Dataset Overview")
    result.append(f"Shape: {current_data.shape[0]} rows × {current_data.shape[1]} columns")
    result.append(f"\nColumns: {', '.join(current_data.columns.tolist())}")
    
    result.append("\n### Data Types:")
    for col in current_data.columns:
        dtype = str(current_data[col].dtype)
        result.append(f"- **{col}**: {dtype}")
    
    result.append("\n### Missing Values:")
    missing = current_data.isnull().sum()
    has_missing = False
    for col in current_data.columns:
        if missing[col] > 0:
            pct = (missing[col] / len(current_data)) * 100
            result.append(f"- **{col}**: {missing[col]} missing ({pct:.1f}%)")
            has_missing = True
    
    if not has_missing:
        result.append("- ✅ No missing values found")
        
    if current_data.select_dtypes(include=[np.number]).shape[1] > 0:
        result.append("\n### Summary Statistics:")
        # Convert describe() output to markdown table
        stats_df = current_data.describe()
        result.append(dataframe_to_markdown(stats_df))
    
    return "\n".join(result)

@tool
def get_column_info() -> str:
    """Get column info: data types and missing values."""
    global current_data
    if current_data is None:
        return "No data loaded. Please upload a CSV file first."
    
    result = [f"## Column Information\n"]
    result.append(f"Dataset has **{len(current_data.columns)} columns** and **{len(current_data)} rows**:\n")
    
    # Create a table
    result.append("| Column | Type | Non-Null | Missing | % Missing |")
    result.append("| --- | --- | --- | --- | --- |")
    
    for col in current_data.columns:
        dtype = str(current_data[col].dtype)
        non_null = current_data[col].count()
        total = len(current_data)
        missing = total - non_null
        pct_missing = (missing / total) * 100
        result.append(f"| {col} | {dtype} | {non_null} | {missing} | {pct_missing:.1f}% |")
    
    return "\n".join(result)

@tool
def get_value_counts(column: str) -> str:
    """Get value counts for a specific column."""
    global current_data
    if current_data is None:
        return "No data loaded. Please upload a CSV file first."
    
    if column not in current_data.columns:
        return f"❌ Column '{column}' not found. Available columns: {', '.join(current_data.columns)}"
    
    result = [f"## Value counts for '{column}'\n"]
    value_counts = current_data[column].value_counts().head(15)
    total = len(current_data)
    
    # Create markdown table
    result.append("| Value | Count | Percentage |")
    result.append("| --- | --- | --- |")
    
    for value, count in value_counts.items():
        pct = (count / total) * 100
        result.append(f"| {value} | {count} | {pct:.1f}% |")
    
    unique_count = current_data[column].nunique()
    if unique_count > 15:
        result.append(f"\n*Showing top 15 of {unique_count} unique values*")
    else:
        result.append(f"\n*Total unique values: {unique_count}*")
    
    return "\n".join(result)

@tool
def create_chart(column: str, chart_type: str = "histogram") -> str:
    """Create a chart for a specific column."""
    global current_data
    if current_data is None:
        return "No data loaded. Please upload a CSV file first."
    
    if column not in current_data.columns:
        return f"❌ Column '{column}' not found. Available columns: {', '.join(current_data.columns)}"
    
    try:
        try:
            import matplotlib.pyplot as plt
            import base64
            from io import BytesIO
            plt.ioff()
        except ImportError:
            return "❌ Chart creation unavailable: matplotlib not installed. You can still analyze the data using other functions."
        
        fig, ax = plt.subplots(figsize=(10, 6))
        
        if chart_type.lower() == "bar":
            value_counts = current_data[column].value_counts().head(10)
            ax.bar(range(len(value_counts)), value_counts.values, color='#059669')
            ax.set_xticks(range(len(value_counts)))
            ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
            ax.set_ylabel('Count')
            ax.set_title(f'Bar Chart: {column}', fontsize=14, fontweight='bold')
            ax.grid(axis='y', alpha=0.3)
            
        elif chart_type.lower() == "histogram":
            if pd.api.types.is_numeric_dtype(current_data[column]):
                ax.hist(current_data[column].dropna(), bins=20, alpha=0.7, color='#059669', edgecolor='white')
                ax.set_xlabel(column)
                ax.set_ylabel('Frequency')
                ax.set_title(f'Histogram: {column}', fontsize=14, fontweight='bold')
                ax.grid(axis='y', alpha=0.3)
            else:
                plt.close(fig)
                return f"❌ Cannot create histogram for non-numeric column '{column}'. Try 'bar' chart instead."
        else:
            plt.close(fig)
            return f"❌ Unsupported chart type '{chart_type}'. Use: bar or histogram"
        
        plt.tight_layout()
        
        buffer = BytesIO()
        plt.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
        buffer.seek(0)
        image_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        plt.close(fig)
        
        return f"""✅ Chart created successfully for '{column}' ({chart_type} chart).

<img src="data:image/png;base64,{image_base64}" alt="{chart_type.title()} chart for {column}" style="max-width: 100%; height: auto; margin: 10px 0; border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">

Chart shows the distribution of values in the '{column}' column."""
        
    except Exception as e:
        return f"❌ Error creating chart: {str(e)}"

@tool
def get_correlation_analysis() -> str:
    """Get correlation analysis for numeric columns."""
    global current_data
    if current_data is None:
        return "No data loaded. Please upload a CSV file first."
    
    numeric_cols = current_data.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) < 2:
        return "❌ Need at least 2 numerical columns to calculate correlations."
    
    corr_matrix = current_data[numeric_cols].corr()
    
    result = ["## Correlation Analysis\n"]
    result.append("### Correlation Matrix:\n")
    
    # Create correlation table using markdown format
    result.append(dataframe_to_markdown(corr_matrix, max_rows=20))
    
    result.append("\n### Key Insights:")
    strong_corr = []
    for i in range(len(numeric_cols)):
        for j in range(i+1, len(numeric_cols)):
            corr_val = corr_matrix.iloc[i, j]
            if abs(corr_val) > 0.7:
                strength = "strong positive" if corr_val > 0 else "strong negative"
                emoji = "📈" if corr_val > 0 else "📉"
                strong_corr.append(f"- {emoji} **{numeric_cols[i]}** and **{numeric_cols[j]}**: {strength} correlation ({corr_val:.3f})")
    
    if strong_corr:
        result.extend(strong_corr)
    else:
        result.append("- ℹ️ No strong correlations found (|r| > 0.7)")
    
    return "\n".join(result)

async def process_user_query(query: str) -> str:
    """Main function to process user queries"""
    
    # Check if provider is 'local' (WebLLM)
    provider = globals().get('PROVIDER', 'openai')
    
    if provider == 'local':
        # Use WebLLM (local in-browser inference) with Nous Research tool calling
        print(f"[Local Mode] Using WebLLM with tool calling")
        
        try:
            # Import async/await support
            from js import webllmEngine
            import json as json_module
            
            # Check if WebLLM engine is loaded
            if not webllmEngine:
                return """❌ WebLLM model not loaded. Please click the "🚀 Load Model" button at the top of the page first.
                
Then you can ask questions like:
- 'Show me a summary of the data'
- 'Create a histogram of sales'
- 'Show correlations between variables'"""
            
            # Define available tools
            tools = [get_data_summary, get_column_info, get_value_counts, create_chart, get_correlation_analysis]
            
            # Create Nous Research system prompt with tool definitions
            # Use template's system_prompt field for domain instructions
            # TEMPLATE_SYSTEM_PROMPT is injected by AgentHTMLGenerator from template.system_prompt
            system_prompt = create_nous_research_system_prompt(
                tools, 
                current_data, 
                current_filename,
                TEMPLATE_SYSTEM_PROMPT  # Gets value from template.system_prompt field
            )
            
            print(f"[WebLLM] Calling model with Nous Research prompt")
            print(f"[WebLLM] User query: {query}")
            
            # Call WebLLM with system prompt and user query
            # Use the bridge function that's already available
            result_json = await call_webllm_from_python(query, system_prompt)
            
            print(f"[WebLLM] Raw response length: {len(result_json)}")
            print(f"[WebLLM] Response preview: {result_json[:200]}")
            
            # Parse tool calls from response
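            # parse_hermes_tool_calls is provided by the host page; it is expected
            # to extract Hermes-style <tool_call>{"name": ..., "arguments": {...}}</tool_call>
            # blocks from the raw model output and return a list of {name, args} dicts.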
            tool_calls = parse_hermes_tool_calls(result_json)
            
            if tool_calls:
                print(f"[WebLLM] Found {len(tool_calls)} tool calls")
                
                # Execute the first tool call
                tool_call = tool_calls[0]
                tool_name = tool_call['name']
                tool_args = tool_call['args']
                
                print(f"[WebLLM] Executing tool: {tool_name} with args: {tool_args}")
                
                # Find the tool by name
                tool_map = {
                    'get_data_summary': get_data_summary,
                    'get_column_info': get_column_info,
                    'get_value_counts': get_value_counts,
                    'create_chart': create_chart,
                    'get_correlation_analysis': get_correlation_analysis
                }
                
                if tool_name in tool_map:
                    tool = tool_map[tool_name]
                    result = tool.invoke(tool_args)
                    print(f"[WebLLM] Tool execution successful, result length: {len(result)}")
                    return result
                else:
                    return f"❌ Unknown tool: {tool_name}"
            else:
                # No tool calls found - return the raw response
                print(f"[WebLLM] No tool calls found, returning raw response")
                
                # Check if response contains helpful text
                if len(result_json) > 50:
                    return result_json
                else:
                    return """I understand you want to analyze your data, but I couldn't determine which specific tool to use.

Please try asking more specifically:
- 'Show me a summary of the data'
- 'Create a histogram of [column name]'
- 'Show correlations between numeric columns'
- 'Show value counts for [column name]'"""
                    
        except Exception as e:
            print(f"[WebLLM Error] {str(e)}")
            import traceback
            traceback.print_exc()
            
            return f"""❌ Error using WebLLM: {str(e)}

You can still use these direct commands:
- 'Show me a summary' - Dataset overview
- 'Show correlations' - Correlation analysis
- 'Create histogram of [column]' - Visualizations"""
    
    # Cloud provider mode (openai/anthropic) - use AI with tool calling
    try:
        # Get API key from global variable (set by JavaScript)
        api_key = globals().get('current_api_key', '')
        
        # Debug output
        print(f"Debug: API key length: {len(api_key) if api_key else 0}")
        
        if not api_key:
            return "⚠️ Please enter an OpenAI API key to use AI-powered analysis features.\n\nYou can still use basic analysis commands like:\n- 'Show me a summary'\n- 'Show correlation between variables'\n- 'Create a histogram'"
        
        llm = ChatOpenAI(
            model="gpt-4o-mini",
            api_key=api_key
        )
        
        # Test API key with a simple call first
        try:
            test_response = llm.invoke([HumanMessage(content="Hello, respond with just 'API key working'")])
            print(f"Debug: API test response: {test_response.content}")
        except Exception as e:
            return f"❌ API key error: {str(e)}\n\nPlease check your OpenAI API key and try again."
        
        tools = [get_data_summary, get_column_info, get_value_counts, create_chart, get_correlation_analysis]
        llm_with_tools = llm.bind_tools(tools)
        
        # Enhanced prompt to encourage tool usage
        system_prompt = """You are a data analysis assistant with access to specialized tools for analyzing uploaded CSV data. 
        
        IMPORTANT: You have access to these tools - always use them when relevant:
        - get_data_summary(): Get dataset overview, shape, columns, data types
        - get_column_info(): Get detailed information about specific columns  
        - get_value_counts(column): Get value distribution for a column
        - create_chart(column, chart_type): Create visualizations (histogram, bar)
        - get_correlation_analysis(): Analyze correlations between numeric columns
        
        When the user asks for analysis, charts, or insights, USE THESE TOOLS rather than providing generic code examples.
        
        Current request: {query}"""
        
        messages = [HumanMessage(content=system_prompt.format(query=query))]
        
        for _ in range(3):  # Tool-calling loop (in practice it returns after the first round of tool results; see below)
            response = llm_with_tools.invoke(messages)
            messages.append(response)
            
            print(f"Debug: Response has {len(response.tool_calls) if response.tool_calls else 0} tool calls")
            
            if not response.tool_calls:
                break
            
            for tool_call in response.tool_calls:
                try:
                    tool_name = tool_call['name']
                    tool_args = tool_call['args']
                    
                    # Find and execute the tool
                    tool_map = {
                        'get_data_summary': get_data_summary,
                        'get_column_info': get_column_info,
                        'get_value_counts': get_value_counts,
                        'create_chart': create_chart,
                        'get_correlation_analysis': get_correlation_analysis
                    }
                    
                    if tool_name in tool_map:
                        tool = tool_map[tool_name]
                        tool_result = tool.invoke(tool_args)
                        messages.append(ToolMessage(content=tool_result, tool_call_id=tool_call['id']))
                    
                    print(f"[OK] Tool {tool_name} executed successfully")
                except Exception as e:
                    error_msg = f"Tool execution error: {str(e)}"
                    messages.append(ToolMessage(content=error_msg, tool_call_id=tool_call.get('id', '')))
                    print(f"[ERROR] Tool {tool_call.get('name', '?')} failed: {error_msg}")
        
            # Return the last tool result directly instead of making another model
            # call; this keeps the browser responsive and ends the loop after the
            # first round of tool calls.
            tool_results = []
            for msg in messages:
                if isinstance(msg, ToolMessage):
                    tool_results.append(msg.content)
        
            if tool_results:
                print(f"[OK] Returning tool results directly, count: {len(tool_results)}")
                return tool_results[-1]  # Return last tool result (e.g., chart HTML)
        
        # If no tools were executed, get AI response
        print("[PROCESSING] Getting final response from AI...")
        final_response = llm.invoke(messages)
        print(f"[OK] Final response received, length: {len(final_response.content)}")
        return final_response.content

    except Exception as e:
        return f"❌ Error processing query: {str(e)}"


def load_csv_data(csv_content: str, filename: str = "data.csv"):
    """Load CSV data into global variable"""
    global current_data, current_filename
    try:
        from io import StringIO
        current_data = pd.read_csv(StringIO(csv_content))
        current_filename = filename
        return f"✅ Loaded {current_data.shape[0]} rows and {current_data.shape[1]} columns from {filename}"
    except Exception as e:
        return f"❌ Error loading CSV: {str(e)}"