1. Import Required Libraries¶

In [233]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings

warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

2. Connect to SQLite Database¶

In [234]:
conn = sqlite3.connect('Company.db')
cursor = conn.cursor()
print("Successfully connected to Database")
Successfully connected to Database

3. Explore Database Schema¶

In [235]:
tables_query = """
SELECT name FROM sqlite_master 
WHERE type='table' 
ORDER BY name;
"""
tables_df = pd.read_sql_query(tables_query, conn)
print("Tables in database:")
print(tables_df)
Tables in database:
                 name
0            Customer
1          Department
2   DepartmentManager
3          Dependants
4            Employee
5           Inventory
6           Locations
7        OrderDetails
8              Orders
9             Product
10      ProjectMember
11           Projects
12          Warehouse
13             people
14    sqlite_sequence

4. Analysis 1: Top 10 Project Workers¶

In [236]:
project_workers_query = """
SELECT
    e.UserName AS Employee_Name,
    SUM(pm.Hours_Worked) AS Total_Hours_Worked
FROM
    ProjectMember pm
JOIN
    Employee e ON pm.Employee_ID = e.Employee_ID
GROUP BY
    e.UserName
ORDER BY
    Total_Hours_Worked DESC
LIMIT 10;
"""

workers_df = pd.read_sql_query(project_workers_query, conn)

plt.figure(figsize=(12, 6))
bars = plt.barh(workers_df['Employee_Name'], workers_df['Total_Hours_Worked'], color='coral')
plt.xlabel('Total Hours Worked')
plt.ylabel('Employee Name')
plt.title('Top 10 Project Workers by Total Hours')
plt.gca().invert_yaxis()

# Add data labels
for i, (bar, value) in enumerate(zip(bars, workers_df['Total_Hours_Worked'])):
    plt.text(value + 50, bar.get_y() + bar.get_height()/2, f'{int(value)}', 
             va='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()
No description has been provided for this image

5. Analysis 2: Department Manager Tenure¶

In [237]:
manager_tenure_query = """
SELECT
    e.UserName AS Manager_Name,
    d.Department_Name,
    CAST((julianday(COALESCE(dm.End_Date, date('now'))) - julianday(dm.Start_Date)) AS INTEGER) AS Days_as_Manager
FROM
    DepartmentManager dm
JOIN
    Employee e ON dm.Employee_ID = e.Employee_ID
JOIN
    Department d ON dm.Department_ID = d.Department_ID
ORDER BY
    Days_as_Manager DESC;
"""

managers_df = pd.read_sql_query(manager_tenure_query, conn)

plt.figure(figsize=(12, 6))
top_managers = managers_df.head(10)
bars = plt.bar(range(len(top_managers)), top_managers['Days_as_Manager'], color='darkgreen')
plt.xlabel('Manager')
plt.ylabel('Days as Manager')
plt.title('Top 10 Longest Serving Department Managers')
plt.xticks(range(len(top_managers)), top_managers['Manager_Name'], rotation=45, ha='right')

# Add data labels
for i, (bar, value) in enumerate(zip(bars, top_managers['Days_as_Manager'])):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 20, f'{int(value)}', 
             ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()
No description has been provided for this image

6. Analysis 3: Employee Count Over Time¶

In [238]:
employee_count_query = """
SELECT
    event_date,
    SUM(daily_net_change) OVER (ORDER BY event_date) AS employee_count
FROM (
    SELECT
        DATE(event_date) AS event_date,
        SUM(change) AS daily_net_change
    FROM (
        SELECT Start_Date AS event_date, 1 AS change FROM Employee
        UNION ALL
        SELECT Leaving_Date AS event_date, -1 AS change FROM Employee WHERE Leaving_Date IS NOT NULL
    ) all_events
    GROUP BY
        DATE(event_date)
) daily_changes
ORDER BY
    event_date;
"""

emp_count_df = pd.read_sql_query(employee_count_query, conn)
emp_count_df['event_date'] = pd.to_datetime(emp_count_df['event_date'])

plt.figure(figsize=(14, 6))
plt.plot(emp_count_df['event_date'], emp_count_df['employee_count'], linewidth=2, color='navy')
plt.fill_between(emp_count_df['event_date'], emp_count_df['employee_count'], alpha=0.3)
plt.xlabel('Date')
plt.ylabel('Employee Count')
plt.title('Employee Headcount Over Time')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
No description has been provided for this image

7. Analysis 4: Warehouse Inventory Value¶

In [239]:
warehouse_query = """
SELECT
    w.Address AS Warehouse_Address,
    SUM(i.Quantity * p.Price) AS Total_Inventory_Value
FROM
    Warehouse w
JOIN
    Inventory i ON w.Warehouse_ID = i.Warehouse_ID
JOIN
    Product p ON i.Product_ID = p.Product_ID
GROUP BY
    w.Address
ORDER BY
    Total_Inventory_Value DESC;
"""

warehouse_df = pd.read_sql_query(warehouse_query, conn)

plt.figure(figsize=(12, 6))
bars = plt.bar(warehouse_df['Warehouse_Address'], warehouse_df['Total_Inventory_Value'], color='purple')
plt.xlabel('Warehouse')
plt.ylabel('Total Inventory Value (£)')
plt.title('Warehouse Inventory Values')
plt.xticks(rotation=45, ha='right')
plt.ticklabel_format(style='plain', axis='y')

# Add data labels
for i, (bar, value) in enumerate(zip(bars, warehouse_df['Total_Inventory_Value'])):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 500, f'£{int(value):,}', 
             ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()
No description has been provided for this image

8. Analysis 5: Salary vs Tenure Analysis¶

In [240]:
salary_tenure_query = """
SELECT
    UserName AS Employee_Name,
    CAST((julianday(COALESCE(Leaving_Date, date('now'))) - julianday(Start_Date)) * 12 / 365 AS INTEGER) AS Months_Worked,
    Salary
FROM
    Employee;
"""

salary_df = pd.read_sql_query(salary_tenure_query, conn)
print("Salary vs Tenure Statistics:")
print(salary_df.describe())

plt.figure(figsize=(12, 6))
plt.scatter(salary_df['Months_Worked'], salary_df['Salary'], alpha=0.6, s=50, color='darkred')
plt.xlabel('Months Worked')
plt.ylabel('Salary (£)')
plt.title('Employee Salary vs Tenure')
plt.grid(True, alpha=0.3)

z = np.polyfit(salary_df['Months_Worked'], salary_df['Salary'], 1)
p = np.poly1d(z)
plt.plot(salary_df['Months_Worked'], p(salary_df['Months_Worked']), "r--", alpha=0.8, linewidth=2, label='Trend')
plt.legend()
plt.tight_layout()
plt.show()
Salary vs Tenure Statistics:
       Months_Worked        Salary
count     500.000000    500.000000
mean       36.840000  63429.194000
std        17.469768  21401.279064
min         3.000000  25000.000000
25%        22.750000  44384.500000
50%        36.000000  63761.000000
75%        50.000000  81669.750000
max        70.000000  99995.000000
No description has been provided for this image

9. Analysis 6: Product Inventory Distribution Heatmap¶

In [241]:
inventory_heatmap_query = """
SELECT
    p.Product_Name,
    w.Address AS Warehouse_Address,
    i.Quantity
FROM
    Product p
INNER JOIN
    Inventory i ON p.Product_ID = i.Product_ID
INNER JOIN
    Warehouse w ON i.Warehouse_ID = w.Warehouse_ID
ORDER BY
    p.Product_Name,
    w.Address;
"""

inventory_df = pd.read_sql_query(inventory_heatmap_query, conn)
inventory_pivot = inventory_df.pivot(index='Product_Name', columns='Warehouse_Address', values='Quantity')

plt.figure(figsize=(14, 8))
sns.heatmap(inventory_pivot, annot=True, fmt='g', cmap='YlOrRd', linewidths=0.5, cbar_kws={'label': 'Quantity'})
plt.title('Product Inventory Distribution Across Warehouses')
plt.xlabel('Warehouse')
plt.ylabel('Product')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
No description has been provided for this image

10. Analysis 7: Department Distribution¶

In [242]:
department_query = """
SELECT
    d.Department_Name,
    COUNT(e.Employee_ID) AS Employee_Count
FROM
    Department d
LEFT JOIN
    Employee e ON d.Department_ID = e.Department_ID
WHERE
    e.Leaving_Date IS NULL
GROUP BY
    d.Department_Name
ORDER BY
    Employee_Count DESC;
"""

dept_df = pd.read_sql_query(department_query, conn)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

ax1.pie(dept_df['Employee_Count'], labels=dept_df['Department_Name'], autopct='%1.1f%%', startangle=90, colors=sns.color_palette('Set2'))
ax1.set_title('Employee Distribution by Department (Pie Chart)')

bars = ax2.bar(dept_df['Department_Name'], dept_df['Employee_Count'], color=sns.color_palette('Set2'))
ax2.set_xlabel('Department')
ax2.set_ylabel('Employee Count')
ax2.set_title('Employee Distribution by Department (Bar Chart)')
ax2.tick_params(axis='x', rotation=45)

# Add data labels
for bar in bars:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2, height + 2, f'{int(height)}', 
             ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()
No description has been provided for this image

11. Analysis 8: Department Salary Distribution (Violin Plot)¶

In [243]:
salary_dist_query = """
SELECT
    d.Department_Name,
    e.Salary
FROM
    Employee e
JOIN
    Department d ON e.Department_ID = d.Department_ID
WHERE
    e.Leaving_Date IS NULL;
"""

salary_dist_df = pd.read_sql_query(salary_dist_query, conn)

plt.figure(figsize=(14, 7))
sns.violinplot(data=salary_dist_df, x='Department_Name', y='Salary', palette='muted', inner='box')
plt.xlabel('Department', fontsize=12)
plt.ylabel('Salary (£)', fontsize=12)
plt.title('Salary Distribution by Department (Violin Plot)', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

print("Salary Statistics by Department:")
print(salary_dist_df.groupby('Department_Name')['Salary'].describe())
No description has been provided for this image
Salary Statistics by Department:
                  count          mean           std      min       25%  \
Department_Name                                                          
Customer Service   82.0  60822.170732  23455.058137  25841.0  36799.50   
Engineering        93.0  64921.204301  20713.750688  27519.0  46904.00   
Finance            92.0  59923.250000  21687.780356  25833.0  40168.00   
HR                 80.0  62275.787500  21160.592734  25839.0  47540.25   
Sales              88.0  65379.579545  21038.877885  25000.0  50695.50   

                      50%       75%      max  
Department_Name                               
Customer Service  57442.5  82303.00  99993.0  
Engineering       65446.0  81461.00  99995.0  
Finance           59972.5  79137.25  99158.0  
HR                62077.0  82296.50  99158.0  
Sales             65871.5  79774.00  99991.0  

12. Analysis 9: Location Employee Distribution (Treemap)¶

In [244]:
treemap_query = """
SELECT
    d.Department_Name,
    l.Address AS Location_Address,
    COUNT(e.Employee_ID) AS Employee_Count
FROM
    Department d
JOIN
    Locations l ON d.Department_ID = l.Department_ID
LEFT JOIN
    Employee e ON l.Location_ID = e.Location_ID
GROUP BY
    d.Department_Name, l.Address
HAVING Employee_Count > 0
ORDER BY
    d.Department_Name, Employee_Count DESC;
"""

treemap_df = pd.read_sql_query(treemap_query, conn)

# Create a simple treemap using squarify
import squarify

plt.figure(figsize=(16, 10))
colors = sns.color_palette('Set3', len(treemap_df))

squarify.plot(sizes=treemap_df['Employee_Count'], 
              label=[f"{row['Department_Name']}\n{row['Location_Address']}\n({row['Employee_Count']} emp)" 
                     for _, row in treemap_df.iterrows()],
              alpha=0.8, 
              color=colors,
              text_kwargs={'fontsize':9, 'weight':'bold'})

plt.title('Employee Distribution Across Departments and Locations (Treemap)', fontsize=16, fontweight='bold', pad=20)
plt.axis('off')
plt.tight_layout()
plt.show()
No description has been provided for this image

13. Analysis 10: Product Order Volume Timeline (Area Chart)¶

In [245]:
product_timeline_query = """
SELECT
    DATE(o.Order_Date) AS Order_Date,
    p.Product_Name,
    SUM(od.Quantity) AS Total_Quantity
FROM
    Orders o
JOIN
    OrderDetails od ON o.Order_ID = od.Order_ID
JOIN
    Product p ON od.Product_ID = p.Product_ID
GROUP BY
    DATE(o.Order_Date), p.Product_Name
ORDER BY
    Order_Date, p.Product_Name;
"""

product_timeline_df = pd.read_sql_query(product_timeline_query, conn)
product_timeline_df['Order_Date'] = pd.to_datetime(product_timeline_df['Order_Date'])

# Pivot for stacked area chart
product_pivot = product_timeline_df.pivot_table(index='Order_Date', columns='Product_Name', values='Total_Quantity', fill_value=0)

plt.figure(figsize=(16, 8))
product_pivot.plot.area(stacked=True, alpha=0.7, figsize=(16, 8), colormap='tab10')
plt.xlabel('Order Date', fontsize=12)
plt.ylabel('Total Quantity Ordered', fontsize=12)
plt.title('Product Order Volume Over Time (Stacked Area Chart)', fontsize=14, fontweight='bold')
plt.legend(title='Product', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
<Figure size 1600x800 with 0 Axes>
No description has been provided for this image

14. Analysis 11: Project Hours by Department (Radar Chart)¶

In [246]:
radar_query = """
SELECT
    d.Department_Name,
    SUM(pm.Hours_Worked) AS Total_Hours
FROM
    Department d
JOIN
    Employee e ON d.Department_ID = e.Department_ID
JOIN
    ProjectMember pm ON e.Employee_ID = pm.Employee_ID
GROUP BY
    d.Department_Name
ORDER BY
    d.Department_Name;
"""

radar_df = pd.read_sql_query(radar_query, conn)

# Create radar chart
categories = radar_df['Department_Name'].tolist()
values = radar_df['Total_Hours'].tolist()

# Close the plot by appending the first value at the end
values += values[:1]
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))
ax.plot(angles, values, 'o-', linewidth=2, color='#1f77b4', label='Total Project Hours')
ax.fill(angles, values, alpha=0.25, color='#1f77b4')
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, size=11)
ax.set_ylim(0, max(values) * 1.1)
ax.set_title('Total Project Hours by Department (Radar Chart)', size=14, fontweight='bold', pad=20)
ax.grid(True, linestyle='--', alpha=0.7)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))

# Add data labels
for angle, value, category in zip(angles[:-1], radar_df['Total_Hours'].tolist(), categories):
    ax.text(angle, value + max(values) * 0.05, f'{int(value)}', 
            ha='center', va='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()
No description has been provided for this image

15. Analysis 12: Department Manager Timeline (Gantt Chart)¶

In [ ]:
 
In [247]:
manager_timeline_query = """
SELECT
    d.Department_Name,
    e.UserName AS Manager_Name,
    dm.Start_Date,
    COALESCE(dm.End_Date, date('now')) AS End_Date,
    CASE WHEN dm.End_Date IS NULL THEN 'Current' ELSE 'Past' END AS Status
FROM
    DepartmentManager dm
JOIN
    Employee e ON dm.Employee_ID = e.Employee_ID
JOIN
    Department d ON dm.Department_ID = d.Department_ID
ORDER BY
    d.Department_Name, dm.Start_Date;
"""

timeline_df = pd.read_sql_query(manager_timeline_query, conn)
timeline_df['Start_Date'] = pd.to_datetime(timeline_df['Start_Date'])
timeline_df['End_Date'] = pd.to_datetime(timeline_df['End_Date'])

# Create Gantt chart using matplotlib
fig, ax = plt.subplots(figsize=(16, 10))

# Get unique departments and assign y-positions
departments = timeline_df['Department_Name'].unique()
dept_positions = {dept: i for i, dept in enumerate(departments)}

# Color map for current vs past managers
colors = {'Current': '#2ecc71', 'Past': '#3498db'}

# Plot horizontal bars for each manager period
for idx, row in timeline_df.iterrows():
    dept_pos = dept_positions[row['Department_Name']]
    start = row['Start_Date']
    end = row['End_Date']
    duration = (end - start).days
    
    # Create bar
    bar = ax.barh(dept_pos, duration, left=start, height=0.6, 
                   color=colors[row['Status']], alpha=0.8, edgecolor='black', linewidth=1)
    
    # Add manager name label in the middle of the bar
    mid_date = start + pd.Timedelta(days=duration/2)
    ax.text(mid_date, dept_pos, row['Manager_Name'], 
            ha='center', va='center', fontsize=9, fontweight='bold', color='white')

# Formatting
ax.set_yticks(range(len(departments)))
ax.set_yticklabels(departments)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Department', fontsize=12)
ax.set_title('Department Manager Timeline Over Time', fontsize=14, fontweight='bold')
ax.grid(True, axis='x', alpha=0.3, linestyle='--')

# Add legend
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor=colors['Current'], label='Current Manager'),
                   Patch(facecolor=colors['Past'], label='Past Manager')]
ax.legend(handles=legend_elements, loc='upper right')

# Format x-axis dates
fig.autofmt_xdate()
plt.tight_layout()
plt.show()
No description has been provided for this image

16. Analysis 13: Employee Turnover Analysis (Sankey Diagram)¶

In [248]:
turnover_query = """
SELECT
    d.Department_Name,
    CASE 
        WHEN e.Leaving_Date IS NULL THEN 'Active'
        ELSE 'Left Company'
    END AS Status,
    COUNT(*) AS Employee_Count
FROM
    Employee e
JOIN
    Department d ON e.Department_ID = d.Department_ID
GROUP BY
    d.Department_Name, Status
ORDER BY
    d.Department_Name, Status;
"""

turnover_df = pd.read_sql_query(turnover_query, conn)

# Prepare data for Sankey
import plotly.graph_objects as go

departments = turnover_df['Department_Name'].unique().tolist()
statuses = ['Active', 'Left Company']

# Create node labels
labels = departments + statuses

# Create links
source = []
target = []
value = []

for _, row in turnover_df.iterrows():
    dept_idx = labels.index(row['Department_Name'])
    status_idx = labels.index(row['Status'])
    source.append(dept_idx)
    target.append(status_idx)
    value.append(row['Employee_Count'])

# Create Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=labels,
        color=['lightblue', 'yellow', 'lavender', 'salmon', 'skyblue', 'orange', 'lightgreen']
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color='rgba(0,0,96,0.2)'
    )
)])

fig.update_layout(
    title_text='Employee Turnover Flow by Department (Sankey Diagram)',
    font_size=12,
    height=600,
    width=1000
)

fig.show()
Employee Turnover Flow Sankey Diagram

17. Close Database Connection¶

In [249]:
cursor.close()
conn.close()
print("Database connection closed successfully")
Database connection closed successfully