Python script for analysis of PM2.5 pollution


Python script for analysis of PM2.5 pollution

In my area, there is a permanently installed meter for measuring the PM25 pollution, as other pollutants also. The state, gives free access to these measurements, but doesn't say anything on how to analyze them. So, i figured to ask ChatGPT about it :) and i think it did a good job. I only post this for future reference for me and perhaps somebody else who wants to do the same thing.

You can found the data in this link: https://ypen.gov.gr/perivallon/poiotita-tis-atmosfairas/dedomena-metriseon-atmosfairikis-rypansis/

A sample of data is like this:

"15-12-2024",17,31,17,19,26,14,13,12,16,37,17,13,11,13,14,10,33,27,33,17,35,57,43,27
"16-12-2024",11,8,14,14,11,12,10,10,11,12,13,13,8,8,11,14,13,30,31,51,78,42,93,41
"17-12-2024",66,48,29,28,31,24,37,30,33,58,28,24,19,11,11,8,16,22,65,55,104,121,80,82
"18-12-2024",98,74,47,38,24,23,34,41,36,33,38,42,27,22,19,8,14,18,75,75,45,92,102,38
"19-12-2024",45,60,36,31,20,23,24,33,47,38,36,25,26,13,17,12,16,35,57,69,91,109,101,94
"20-12-2024",89,71,51,33,38,56,38,48,60,63,35,44,21,24,23,32,42,15,11,8,16,18,16,12
"21-12-2024",10,15,12,11,7,9,8,7,13,14,13,10,10,8,10,11,12,11,27,56,33,35,64,57
"22-12-2024",56,62,29,30,15,19,24,28,35,45,38,18,11,11,27,17,13,8,10,16,32,44,48,70
"23-12-2024",46,43,28,18,23,21,18,20,24,21,22,10,12,10,5,7,13,12,12,12,14,12,21,14
"24-12-2024",21,14,9,4,5,8,7,12,9,15,16,14,11,9,6,7,16,31,36,47,26,27,21,16
"25-12-2024",18,19,9,5,5,6,7,8,6,5,3,6,8,8,14,9,8,8,7,4,3,6,13,10
"26-12-2024",8,5,6,4,4,5,3,6,11,10,7,9,12,8,8,10,7,12,8,8,7,6,13,13
"27-12-2024",7,9,7,7,5,3,4,3,4,9,15,14,10,8,10,9,10,14,10,22,15,13,10,7
"28-12-2024",11,8,10,8,4,4,6,6,5,3,8,12,10,8,5,8,19,14,15,13,15,12,10,14
"29-12-2024",8,7,9,11,10,10,12,14,13,16,18,16,25,17,17,18,21,20,24,57,40,31,28,31
"30-12-2024",26,23,22,26,22,15,18,28,23,40,36,16,17,11,11,11,10,12,31,77,38,132,68,78
"31-12-2024",72,61,51,24,40,37,41,43,29,40,72,44,29,15,10,8,11,16,59,84,69,128,109,168

Each line, has the date and 24 values of PM25 levels for each hour of the day. Values with number -9999 mean, that there is no measure for that time and are ignored. Actually converted to Nan value, so the panda library ignore them. The script create graphs in PNG images, that show the averages per month, per year and also a very nice heatmap. Below is the complete script.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os

# --- CONFIG ---
CSV_FILE = "data.csv"
SAVE_DIR = "pm25_graphs"  # Folder to save graphs
# AQI PM2.5 Breakpoints (µg/m³)
AQI_COLORS = [
    (0, 12, "#00E400", "Good"),
    (12.1, 35.4, "#FFFF00", "Moderate"),
    (35.5, 55.4, "#FF7E00", "Unhealthy for Sensitive Groups"),
    (55.5, 150.4, "#FF0000", "Unhealthy"),
    (150.5, 250.4, "#8F3F97", "Very Unhealthy"),
    (250.5, 500, "#7E0023", "Hazardous")
]
# --------------

os.makedirs(SAVE_DIR, exist_ok=True)

# 1. Load data
df = pd.read_csv(CSV_FILE, parse_dates=[0], dayfirst=True)
df.columns = ["date"] + [f"h{i}" for i in range(24)]

# Replace -9999 with NaN
df.replace(-9999, pd.NA, inplace=True)

# Drop rows (days) where all 24 hourly values are NaN
hour_cols = [f"h{i}" for i in range(24)]
df = df.dropna(subset=hour_cols, how="all")

# Convert to long format
df_long = df.melt(id_vars="date", var_name="hour", value_name="pm25")
df_long["hour"] = df_long["hour"].str[1:].astype(int)

# Add time breakdowns
df_long["month"] = df_long["date"].dt.to_period("M")
df_long["year"] = df_long["date"].dt.year
df_long["day_of_year"] = df_long["date"].dt.dayofyear

# 2. Daily average with rolling averages (NaN ignored automatically)
daily_avg = df_long.groupby("date")["pm25"].mean()
daily_avg_7d = daily_avg.rolling(7, min_periods=1).mean()
daily_avg_30d = daily_avg.rolling(30, min_periods=1).mean()

plt.figure(figsize=(15, 5))
plt.plot(daily_avg.index.to_numpy(), daily_avg.to_numpy(), label="Daily Average", alpha=0.5)
plt.plot(daily_avg_7d.index.to_numpy(), daily_avg_7d.to_numpy(), label="7-Day Rolling Avg", linewidth=2)
plt.plot(daily_avg_30d.index.to_numpy(), daily_avg_30d.to_numpy(), label="30-Day Rolling Avg", linewidth=2)
plt.title("Daily PM2.5 Levels (with Rolling Averages)")
plt.ylabel("PM2.5 (µg/m³)")
plt.xlabel("Date")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig(f"{SAVE_DIR}/daily_avg_with_roll.png", dpi=300)
plt.close()

# 3. Monthly average
monthly_avg = df_long.groupby("month")["pm25"].mean()
monthly_avg.index = monthly_avg.index.to_timestamp()

plt.figure(figsize=(15, 5))
plt.plot(monthly_avg.index.to_numpy(), monthly_avg.to_numpy())
plt.title("Monthly Average PM2.5 Levels")
plt.ylabel("PM2.5 (µg/m³)")
plt.xlabel("Month")
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{SAVE_DIR}/monthly_avg.png", dpi=300)
plt.close()

# 4. Yearly average
yearly_avg = df_long.groupby("year")["pm25"].mean()

plt.figure(figsize=(10, 5))
plt.plot(yearly_avg.index.to_numpy(), yearly_avg.to_numpy(), marker="o")
plt.title("Yearly Average PM2.5 Levels")
plt.ylabel("PM2.5 (µg/m³)")
plt.xlabel("Year")
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{SAVE_DIR}/yearly_avg.png", dpi=300)
plt.close()

# 5. Hourly heatmap (average per hour/day of year)
heatmap_data = df_long.pivot_table(
    index="hour",
    columns="day_of_year",
    values="pm25",
    aggfunc="mean"
)

heatmap_data = heatmap_data.sort_index(axis=0).sort_index(axis=1)

plt.figure(figsize=(20, 6))
sns.heatmap(
    heatmap_data.astype(float).to_numpy(),
    cmap="RdYlGn_r",
    cbar_kws={'label': 'PM2.5 (µg/m³)'},
    mask=heatmap_data.isna().to_numpy()  # mask NaNs to hide them
)
plt.title("Hourly PM2.5 Heatmap (Day of Year vs Hour)")
plt.xlabel("Day of Year")
plt.ylabel("Hour of Day")
plt.tight_layout()
plt.savefig(f"{SAVE_DIR}/hourly_heatmap.png", dpi=300)
plt.close()

# 6. Interactive Plotly: Daily avg with AQI coloring
daily_df = daily_avg.reset_index()

def get_aqi_category(pm):
    for low, high, color, name in AQI_COLORS:
        if pd.notna(pm) and low <= pm <= high:
            return name
    return "Beyond Index"

daily_df["AQI Category"] = daily_df["pm25"].apply(get_aqi_category)
fig = px.scatter(
    daily_df, x="date", y="pm25", color="AQI Category",
    title="Daily PM2.5 Levels with AQI Categories",
    labels={"pm25": "PM2.5 (µg/m³)", "date": "Date"},
    color_discrete_map={name: color for _, _, color, name in AQI_COLORS}
)
fig.write_html(f"{SAVE_DIR}/daily_aqi_interactive.html")

print(f"Graphs saved in '{SAVE_DIR}' (PNG + interactive HTML).")

Add a comment

Previous