# ts6-grafana/exporter.py

"""
TeamSpeak 6 Prometheus Exporter.

Polls the TS6 WebQuery HTTP API and exposes the collected metrics
in Prometheus format on the /metrics endpoint.
"""

import os
import sys
import time
import signal
import logging
from prometheus_client import (
    start_http_server,
    Gauge,
    Counter,
    Info,
    REGISTRY,
    GC_COLLECTOR,
    PLATFORM_COLLECTOR,
    PROCESS_COLLECTOR,
)
from ts6_client import TS6Client

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Verbosity is taken from the LOG_LEVEL environment variable (default "INFO").
# The upper-cased name is looked up on the logging module; a name that is not
# an attribute of that module falls back to logging.INFO.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=getattr(logging, LOG_LEVEL, logging.INFO),
)
# Single module-wide logger used by every function in this file.
logger = logging.getLogger("ts6_exporter")

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
def _env_int(name, default, minimum=None):
    """Read the environment variable *name* as an integer.

    Args:
        name: Environment variable to read.
        default: String used when the variable is unset.
        minimum: Optional inclusive lower bound for the parsed value.

    Returns:
        The parsed integer.

    Exits the process with status 1 (after logging the reason) when the value
    is not an integer or is below *minimum*, mirroring how a missing
    TS6_API_KEY is handled below instead of dying with a raw traceback.
    """
    raw = os.getenv(name, default)
    try:
        value = int(raw)
    except ValueError:
        logger.error("%s must be an integer, got %r", name, raw)
        sys.exit(1)
    if minimum is not None and value < minimum:
        logger.error("%s must be >= %s, got %s", name, minimum, value)
        sys.exit(1)
    return value


TS6_HOST = os.getenv("TS6_HOST", "localhost")
TS6_QUERY_PORT = _env_int("TS6_QUERY_PORT", "10080")
TS6_API_KEY = os.getenv("TS6_API_KEY", "")
TS6_SERVER_ID = _env_int("TS6_SERVER_ID", "1")
EXPORTER_PORT = _env_int("EXPORTER_PORT", "9189")
# The interval is passed to time.sleep() in the polling loop: a negative value
# would raise there and zero would poll without pause, so reject both up front.
POLL_INTERVAL = _env_int("POLL_INTERVAL", "15", minimum=1)
METRIC_PREFIX = os.getenv("METRIC_PREFIX", "ts6")

# The API key has no usable default; refuse to start without it.
if not TS6_API_KEY:
    logger.error("TS6_API_KEY environment variable is required!")
    sys.exit(1)

# ---------------------------------------------------------------------------
# Optionally remove default Python collectors for cleaner output
# ---------------------------------------------------------------------------
# Enabled by default; any value other than "true" (case-insensitive) keeps the
# GC, platform and process collectors in the default registry.
DISABLE_DEFAULT_COLLECTORS = os.getenv("DISABLE_DEFAULT_COLLECTORS", "true").lower() == "true"
if DISABLE_DEFAULT_COLLECTORS:
    for collector in (GC_COLLECTOR, PLATFORM_COLLECTOR, PROCESS_COLLECTOR):
        try:
            REGISTRY.unregister(collector)
        except KeyError:
            # The registry raises KeyError for a collector it does not hold.
            # Removal is best-effort, so note it and carry on rather than
            # silently hiding every possible error.
            logger.debug("Default collector %r was not registered; nothing to remove", collector)

# ---------------------------------------------------------------------------
# Prometheus Metrics
# ---------------------------------------------------------------------------

def _prefixed(suffix):
    """Return the full metric name: METRIC_PREFIX, an underscore, then *suffix*."""
    return f"{METRIC_PREFIX}_{suffix}"


# --- Server reachability, uptime and version ---
server_up = Gauge(
    _prefixed("server_up"),
    "Whether the TS6 server is reachable (1=up, 0=down)",
)
server_uptime = Gauge(
    _prefixed("server_uptime_seconds"),
    "Server uptime in seconds",
)
server_version_info = Info(
    _prefixed("server_version"),
    "TeamSpeak server version info",
)

# --- Client counts ---
clients_online = Gauge(
    _prefixed("clients_online"),
    "Number of clients currently online (excluding query clients)",
)
clients_max = Gauge(
    _prefixed("clients_max"),
    "Maximum allowed clients",
)
query_clients_online = Gauge(
    _prefixed("query_clients_online"),
    "Number of query clients online",
)

# --- Channels ---
channels_total = Gauge(
    _prefixed("channels_total"),
    "Total number of channels",
)

# --- Bandwidth (values are set from the server's reported totals) ---
bytes_sent = Gauge(
    _prefixed("bytes_sent_total"),
    "Total bytes sent by the server",
)
bytes_received = Gauge(
    _prefixed("bytes_received_total"),
    "Total bytes received by the server",
)
packets_sent = Gauge(
    _prefixed("packets_sent_total"),
    "Total packets sent",
)
packets_received = Gauge(
    _prefixed("packets_received_total"),
    "Total packets received",
)

# --- File transfer ---
ft_bytes_sent = Gauge(
    _prefixed("file_transfer_bytes_sent_total"),
    "Total file transfer bytes sent",
)
ft_bytes_received = Gauge(
    _prefixed("file_transfer_bytes_received_total"),
    "Total file transfer bytes received",
)

# --- Connection quality ---
avg_ping = Gauge(
    _prefixed("average_ping_seconds"),
    "Average client ping in seconds",
)
avg_packet_loss = Gauge(
    _prefixed("average_packet_loss"),
    "Average client packet loss ratio",
)

# --- Bans ---
bans_total = Gauge(
    _prefixed("bans_total"),
    "Total number of active bans",
)

# --- Server groups ---
server_groups_total = Gauge(
    _prefixed("server_groups_total"),
    "Total number of server groups",
)

# --- Per-client series: one labelled child per connected client ---
client_info_gauge = Gauge(
    _prefixed("client_connected"),
    "Connected client info (1 = connected)",
    ["client_id", "nickname", "platform", "version", "country", "channel_id"],
)

# --- Exporter self-monitoring ---
scrape_duration = Gauge(
    _prefixed("scrape_duration_seconds"),
    "Duration of the last scrape in seconds",
)
scrape_errors = Counter(
    _prefixed("scrape_errors_total"),
    "Total number of scrape errors",
)

# ---------------------------------------------------------------------------
# Collector
# ---------------------------------------------------------------------------

def safe_int(value, default=0):
    """Convert *value* to int without ever raising.

    Args:
        value: Anything accepted by ``int()``, or a numeric string with a
            fractional/exponent part such as ``"12.0"`` or ``"1e3"``.
        default: Returned when *value* cannot be interpreted as a finite number.

    Returns:
        The integer value (fractional parts are truncated, as ``int()`` does
        for floats), or *default*.
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")). ValueError also covers strings
        # like "12.0", which get a second chance through float() below.
        pass
    try:
        return int(float(value))
    except (ValueError, TypeError, OverflowError):
        return default


def safe_float(value, default=0.0):
    """Convert *value* to float without ever raising.

    Args:
        value: Anything accepted by ``float()``.
        default: Returned when the conversion fails.

    Returns:
        The float value, or *default* when *value* is of an unsupported type,
        is not a numeric string, or is an integer too large for a float.
    """
    try:
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError is raised for ints beyond the float range.
        return default


def _update_version(client: TS6Client) -> None:
    """Publish the server version labels; a failure is only logged at debug level."""
    try:
        ver = client.version()
        if ver:
            server_version_info.info({
                "version": ver.get("version", "unknown"),
                "build": str(ver.get("build", "unknown")),
                "platform": ver.get("platform", "unknown"),
            })
    except Exception as e:
        logger.debug("Could not get version: %s", e)


def _update_server_info(client: TS6Client):
    """Refresh the server-wide gauges from ``client.server_info()``.

    Returns:
        ``(clients, channels)`` as just published, for the scrape summary log,
        or ``None`` when no info was returned or the call failed (a failure is
        logged and counted in ``scrape_errors``).
    """
    try:
        info = client.server_info()
        if not info:
            return None

        query_online = safe_int(info.get("virtualserver_queryclientsonline", 0))
        # The reported client count includes query clients; subtract them.
        regular_online = safe_int(info.get("virtualserver_clientsonline", 0)) - query_online
        channels = safe_int(info.get("virtualserver_channelsonline", 0))

        server_uptime.set(safe_int(info.get("virtualserver_uptime", 0)))
        clients_online.set(regular_online)
        clients_max.set(safe_int(info.get("virtualserver_maxclients", 0)))
        query_clients_online.set(query_online)
        channels_total.set(channels)

        # Bandwidth
        bytes_sent.set(safe_int(info.get("connection_bytes_sent_total", 0)))
        bytes_received.set(safe_int(info.get("connection_bytes_received_total", 0)))
        packets_sent.set(safe_int(info.get("connection_packets_sent_total", 0)))
        packets_received.set(safe_int(info.get("connection_packets_received_total", 0)))

        # File transfer
        ft_bytes_sent.set(safe_int(info.get("connection_filetransfer_bytes_sent_total", 0)))
        ft_bytes_received.set(safe_int(info.get("connection_filetransfer_bytes_received_total", 0)))

        # Quality: the ping is divided by 1000 to publish it in seconds
        # (presumably the server reports milliseconds -- confirm against the API).
        avg_ping.set(safe_float(info.get("virtualserver_total_ping", 0.0)) / 1000.0)
        avg_packet_loss.set(safe_float(info.get("virtualserver_total_packetloss_total", 0.0)))

        return regular_online, channels
    except Exception as e:
        logger.error("Error collecting server info: %s", e)
        scrape_errors.inc()
        return None


def _update_bans(client: TS6Client) -> None:
    """Publish the number of bans; the gauge is reset to 0 when the list cannot be read."""
    try:
        bans_total.set(len(client.ban_list()))
    except Exception as e:
        logger.debug("Could not get ban list: %s", e)
        bans_total.set(0)


def _update_server_groups(client: TS6Client) -> None:
    """Publish the number of server groups; on failure the previous value is kept."""
    try:
        server_groups_total.set(len(client.server_group_list()))
    except Exception as e:
        logger.debug("Could not get server groups: %s", e)


def _update_clients(client: TS6Client) -> None:
    """Rebuild the per-client series from ``client.client_list()``.

    The list is fetched and turned into label sets *before* the old series are
    dropped, so a concurrent /metrics scrape does not see an empty gauge for
    the duration of the network call. If anything fails the series are cleared,
    which is what a failed refresh produced before as well.
    """
    try:
        label_sets = [
            {
                "client_id": c.get("clid", ""),
                "nickname": c.get("client_nickname", "unknown"),
                "platform": c.get("client_platform", "unknown"),
                "version": c.get("client_version", "unknown"),
                "country": c.get("client_country", ""),
                "channel_id": c.get("cid", ""),
            }
            for c in client.client_list()
            # client_type == 1 marks a query client; those are not exported.
            if safe_int(c.get("client_type", 0)) != 1
        ]
        # Public, lock-protected replacement for poking at the private
        # ``_metrics`` dict of the gauge.
        client_info_gauge.clear()
        for label_set in label_sets:
            client_info_gauge.labels(**label_set).set(1)
    except Exception as e:
        logger.debug("Could not get client list: %s", e)
        client_info_gauge.clear()


def collect_metrics(client: TS6Client):
    """Run one collection cycle against the TS6 server and update all metrics.

    Sets ``server_up`` from ``client.is_alive()``. When the server is not
    alive the cycle ends early and ``scrape_errors`` is incremented. Otherwise
    version, server info, bans, server groups and per-client series are
    refreshed in that order; each group handles its own errors so one failing
    call does not prevent the others from updating.

    Any unexpected exception marks the server as down and counts a scrape
    error. The scrape duration is always recorded and a summary line logged;
    the summary shows ``?`` for values that were not refreshed in this cycle.

    Args:
        client: Client used to query the TS6 server.
    """
    # Monotonic clock: the duration must not be affected by wall-clock changes.
    start_time = time.monotonic()
    summary = None

    try:
        alive = client.is_alive()
        server_up.set(1 if alive else 0)

        if not alive:
            logger.warning("TS6 server is not reachable")
            scrape_errors.inc()
            return

        _update_version(client)
        summary = _update_server_info(client)
        _update_bans(client)
        _update_server_groups(client)
        _update_clients(client)

    except Exception as e:
        logger.error("Unexpected error during collection: %s", e)
        server_up.set(0)
        scrape_errors.inc()

    finally:
        duration = time.monotonic() - start_time
        scrape_duration.set(duration)
        clients, channels = summary if summary is not None else ("?", "?")
        logger.info(
            "Scrape completed in %.3fs | clients=%s channels=%s",
            duration,
            clients,
            channels,
        )


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Start the metrics HTTP server and poll the TS6 server until signalled.

    Logs the effective configuration, creates the TS6 client, starts the
    Prometheus HTTP endpoint on ``EXPORTER_PORT`` and then runs
    ``collect_metrics`` followed by a pause of ``POLL_INTERVAL`` seconds, in a
    loop. SIGTERM and SIGINT request a graceful stop of that loop.
    """
    logger.info("=" * 60)
    logger.info("TeamSpeak 6 Prometheus Exporter")
    logger.info("=" * 60)
    logger.info("TS6 Host:       %s:%s", TS6_HOST, TS6_QUERY_PORT)
    logger.info("Server ID:      %s", TS6_SERVER_ID)
    logger.info("Exporter Port:  %s", EXPORTER_PORT)
    logger.info("Poll Interval:  %ss", POLL_INTERVAL)
    logger.info("=" * 60)

    client = TS6Client(
        host=TS6_HOST,
        port=TS6_QUERY_PORT,
        api_key=TS6_API_KEY,
        server_id=TS6_SERVER_ID,
    )

    # Start Prometheus HTTP server
    start_http_server(EXPORTER_PORT)
    logger.info("Metrics server started on http://0.0.0.0:%s/metrics", EXPORTER_PORT)

    # Graceful shutdown: the handlers only flip a flag that the loops below poll.
    running = True

    def shutdown(signum, frame):
        nonlocal running
        logger.info("Received signal %s, shutting down...", signum)
        running = False

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    # Longest single sleep; bounds how long a stop request can go unnoticed.
    max_sleep_slice = 1.0

    # Main polling loop
    while running:
        collect_metrics(client)

        # time.sleep() resumes after a signal handler returns, so one long
        # sleep would delay shutdown by up to a full poll interval. Sleep in
        # short slices instead and re-check the flag between them.
        deadline = time.monotonic() + POLL_INTERVAL
        while running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(remaining, max_sleep_slice))

    logger.info("Exporter stopped.")


# Start the exporter only when this file is executed as a script; importing
# the module does not enter the polling loop.
if __name__ == "__main__":
    main()