ts6-grafana/exporter.py

282 lines
10 KiB
Python

"""
TeamSpeak 6 Prometheus Exporter.
Polls the TS6 WebQuery HTTP API and exposes metrics
in Prometheus format on /metrics endpoint.
"""
import os
import sys
import time
import signal
import logging
from prometheus_client import (
start_http_server,
Gauge,
Counter,
Info,
REGISTRY,
GC_COLLECTOR,
PLATFORM_COLLECTOR,
PROCESS_COLLECTOR,
)
from ts6_client import TS6Client
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Verbosity comes from the LOG_LEVEL environment variable (case-insensitive).
# A value that does not name an attribute of the logging module falls back
# to INFO.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO").upper()

logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=getattr(logging, LOG_LEVEL, logging.INFO),
)

# Single logger shared by the whole exporter module.
logger = logging.getLogger("ts6_exporter")
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
def _env_int(name, default, minimum=None):
    """Read an integer setting from the environment.

    Args:
        name: Name of the environment variable.
        default: String used when the variable is not set.
        minimum: Optional inclusive lower bound for the parsed value.

    Returns:
        The parsed integer.

    Exits the process with status 1, after logging the problem, when the
    value is not an integer or is below *minimum*. This mirrors how a
    missing TS6_API_KEY is reported instead of surfacing a raw traceback.
    """
    raw = os.getenv(name, default)
    try:
        value = int(raw)
    except ValueError:
        logger.error("%s must be an integer, got %r", name, raw)
        sys.exit(1)
    if minimum is not None and value < minimum:
        logger.error("%s must be >= %s, got %s", name, minimum, value)
        sys.exit(1)
    return value


TS6_HOST = os.getenv("TS6_HOST", "localhost")
TS6_QUERY_PORT = _env_int("TS6_QUERY_PORT", "10080")
TS6_API_KEY = os.getenv("TS6_API_KEY", "")
TS6_SERVER_ID = _env_int("TS6_SERVER_ID", "1")
EXPORTER_PORT = _env_int("EXPORTER_PORT", "9189")
# The polling loop sleeps for this many seconds between collections; zero or
# a negative value would busy-loop or make time.sleep() raise.
POLL_INTERVAL = _env_int("POLL_INTERVAL", "15", minimum=1)
METRIC_PREFIX = os.getenv("METRIC_PREFIX", "ts6")

# The API key has no usable default, so refuse to start without it.
if not TS6_API_KEY:
    logger.error("TS6_API_KEY environment variable is required!")
    sys.exit(1)
# ---------------------------------------------------------------------------
# Optionally remove default Python collectors for cleaner output
# ---------------------------------------------------------------------------
# Enabled by default. Besides "true", the other common affirmative spellings
# are accepted so that e.g. DISABLE_DEFAULT_COLLECTORS=1 is not silently
# treated as "false".
DISABLE_DEFAULT_COLLECTORS = os.getenv(
    "DISABLE_DEFAULT_COLLECTORS", "true"
).strip().lower() in {"1", "true", "yes", "on"}

if DISABLE_DEFAULT_COLLECTORS:
    for collector in (GC_COLLECTOR, PLATFORM_COLLECTOR, PROCESS_COLLECTOR):
        try:
            REGISTRY.unregister(collector)
        except Exception as exc:
            # Best effort: a collector that cannot be removed must not stop
            # the exporter from starting, but leave a trace for debugging
            # instead of discarding the error.
            logger.debug(
                "Could not unregister default collector %r: %s", collector, exc
            )
# ---------------------------------------------------------------------------
# Prometheus Metrics
# ---------------------------------------------------------------------------
# Every metric name is built from METRIC_PREFIX so the whole family can be
# renamed through configuration.

# --- Server ----------------------------------------------------------------
server_up = Gauge(
    f"{METRIC_PREFIX}_server_up",
    "Whether the TS6 server is reachable (1=up, 0=down)",
)
server_uptime = Gauge(
    f"{METRIC_PREFIX}_server_uptime_seconds",
    "Server uptime in seconds",
)
server_version_info = Info(
    f"{METRIC_PREFIX}_server_version",
    "TeamSpeak server version info",
)

# --- Clients ---------------------------------------------------------------
clients_online = Gauge(
    f"{METRIC_PREFIX}_clients_online",
    "Number of clients currently online (excluding query clients)",
)
clients_max = Gauge(
    f"{METRIC_PREFIX}_clients_max",
    "Maximum allowed clients",
)
query_clients_online = Gauge(
    f"{METRIC_PREFIX}_query_clients_online",
    "Number of query clients online",
)

# --- Channels --------------------------------------------------------------
channels_total = Gauge(
    f"{METRIC_PREFIX}_channels_total",
    "Total number of channels",
)

# --- Bandwidth -------------------------------------------------------------
# NOTE(review): these are running totals but are declared as Gauges because
# the collector overwrites them with set() on every poll.
bytes_sent = Gauge(
    f"{METRIC_PREFIX}_bytes_sent_total",
    "Total bytes sent by the server",
)
bytes_received = Gauge(
    f"{METRIC_PREFIX}_bytes_received_total",
    "Total bytes received by the server",
)
packets_sent = Gauge(
    f"{METRIC_PREFIX}_packets_sent_total",
    "Total packets sent",
)
packets_received = Gauge(
    f"{METRIC_PREFIX}_packets_received_total",
    "Total packets received",
)

# --- File transfer ---------------------------------------------------------
ft_bytes_sent = Gauge(
    f"{METRIC_PREFIX}_file_transfer_bytes_sent_total",
    "Total file transfer bytes sent",
)
ft_bytes_received = Gauge(
    f"{METRIC_PREFIX}_file_transfer_bytes_received_total",
    "Total file transfer bytes received",
)

# --- Quality ---------------------------------------------------------------
avg_ping = Gauge(
    f"{METRIC_PREFIX}_average_ping_seconds",
    "Average client ping in seconds",
)
avg_packet_loss = Gauge(
    f"{METRIC_PREFIX}_average_packet_loss",
    "Average client packet loss ratio",
)

# --- Bans ------------------------------------------------------------------
bans_total = Gauge(
    f"{METRIC_PREFIX}_bans_total",
    "Total number of active bans",
)

# --- Server groups ---------------------------------------------------------
server_groups_total = Gauge(
    f"{METRIC_PREFIX}_server_groups_total",
    "Total number of server groups",
)

# --- Per-client info -------------------------------------------------------
# One labelled series per connected client; the sample value is always 1.
client_info_gauge = Gauge(
    f"{METRIC_PREFIX}_client_connected",
    "Connected client info (1 = connected)",
    ["client_id", "nickname", "platform", "version", "country", "channel_id"],
)

# --- Exporter self-monitoring ----------------------------------------------
scrape_duration = Gauge(
    f"{METRIC_PREFIX}_scrape_duration_seconds",
    "Duration of the last scrape in seconds",
)
scrape_errors = Counter(
    f"{METRIC_PREFIX}_scrape_errors_total",
    "Total number of scrape errors",
)
# ---------------------------------------------------------------------------
# Collector
# ---------------------------------------------------------------------------
def safe_int(value, default=0):
    """Convert *value* to ``int``, falling back to *default* on failure.

    Args:
        value: Anything ``int()`` accepts, e.g. a numeric string or a number.
        default: Value returned when the conversion is not possible.

    Returns:
        ``int(value)``, or *default* when the conversion raises
        ``ValueError`` (non-numeric string, NaN), ``TypeError`` (e.g.
        ``None``) or ``OverflowError`` (infinite float).
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError is what int() raises for float("inf"); without it the
        # helper would not be "safe" for every float input.
        return default
def safe_float(value, default=0.0):
    """Convert *value* to ``float``, falling back to *default* on failure.

    Args:
        value: Anything ``float()`` accepts, e.g. a numeric string or number.
        default: Value returned when the conversion is not possible.

    Returns:
        ``float(value)``, or *default* when the conversion raises
        ``ValueError`` (non-numeric string), ``TypeError`` (e.g. ``None``)
        or ``OverflowError`` (an integer too large for a float).
    """
    try:
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers integers beyond float range, keeping this
        # helper consistent with safe_int().
        return default
def _collect_version(client: TS6Client):
    """Publish the server version as Info labels; a failure is only logged."""
    try:
        ver = client.version()
        if ver:
            server_version_info.info({
                "version": ver.get("version", "unknown"),
                "build": str(ver.get("build", "unknown")),
                "platform": ver.get("platform", "unknown"),
            })
    except Exception as e:
        logger.debug("Could not get version: %s", e)


def _collect_server_info(client: TS6Client):
    """Update the server-wide gauges from ``client.server_info()``.

    Returns:
        ``(clients_online, channels)`` as published, for the summary log
        line, or ``None`` when no server info was available. A failure is
        logged and counted in ``scrape_errors``.
    """
    try:
        info = client.server_info()
        if not info:
            return None

        query_clients = safe_int(info.get("virtualserver_queryclientsonline", 0))
        # The reported client count includes query clients; publish them
        # separately so clients_online only reflects regular clients.
        regular_clients = safe_int(info.get("virtualserver_clientsonline", 0)) - query_clients
        channels = safe_int(info.get("virtualserver_channelsonline", 0))

        server_uptime.set(safe_int(info.get("virtualserver_uptime", 0)))
        clients_online.set(regular_clients)
        clients_max.set(safe_int(info.get("virtualserver_maxclients", 0)))
        query_clients_online.set(query_clients)
        channels_total.set(channels)

        # Bandwidth
        bytes_sent.set(safe_int(info.get("connection_bytes_sent_total", 0)))
        bytes_received.set(safe_int(info.get("connection_bytes_received_total", 0)))
        packets_sent.set(safe_int(info.get("connection_packets_sent_total", 0)))
        packets_received.set(safe_int(info.get("connection_packets_received_total", 0)))

        # File transfer
        ft_bytes_sent.set(safe_int(info.get("connection_filetransfer_bytes_sent_total", 0)))
        ft_bytes_received.set(safe_int(info.get("connection_filetransfer_bytes_received_total", 0)))

        # Quality. The ping is divided by 1000 to publish seconds
        # (assumes the API reports milliseconds - TODO confirm).
        avg_ping.set(safe_float(info.get("virtualserver_total_ping", 0.0)) / 1000.0)
        avg_packet_loss.set(safe_float(info.get("virtualserver_total_packetloss_total", 0.0)))

        return regular_clients, channels
    except Exception as e:
        logger.error("Error collecting server info: %s", e)
        scrape_errors.inc()
        return None


def _collect_bans(client: TS6Client):
    """Publish the number of bans; reports 0 when the list is unavailable."""
    try:
        bans_total.set(len(client.ban_list()))
    except Exception as e:
        logger.debug("Could not get ban list: %s", e)
        bans_total.set(0)


def _collect_server_groups(client: TS6Client):
    """Publish the number of server groups; keeps the old value on failure."""
    try:
        server_groups_total.set(len(client.server_group_list()))
    except Exception as e:
        logger.debug("Could not get server groups: %s", e)


def _collect_clients(client: TS6Client):
    """Rebuild the per-client gauge from the current client list.

    The list is fetched and converted to label sets *before* the old series
    are dropped, so a concurrent scrape does not see an empty gauge for the
    whole duration of the network call.
    """
    label_sets = []
    try:
        for c in client.client_list():
            # Skip query clients (client_type=1)
            if safe_int(c.get("client_type", 0)) == 1:
                continue
            label_sets.append({
                "client_id": c.get("clid", ""),
                "nickname": c.get("client_nickname", "unknown"),
                "platform": c.get("client_platform", "unknown"),
                "version": c.get("client_version", "unknown"),
                "country": c.get("client_country", ""),
                "channel_id": c.get("cid", ""),
            })
    except Exception as e:
        logger.debug("Could not get client list: %s", e)

    # Drop series of clients that have left. clear() is the public API for
    # removing all label sets and takes the metric's lock.
    client_info_gauge.clear()
    for labels in label_sets:
        client_info_gauge.labels(**labels).set(1)


def collect_metrics(client: TS6Client):
    """Collect all metrics from the TS6 server.

    Runs one polling cycle: checks reachability, then updates version,
    server info, bans, server groups and per-client series. Each section
    handles its own errors so one failing query does not prevent the others
    from being published. The scrape duration is always recorded and a
    summary line is always logged, including when the server is unreachable.

    Args:
        client: API client used to query the server.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    started = time.perf_counter()
    # Values for the summary log line; "?" when this cycle did not get them.
    online, channels = "?", "?"
    try:
        # Check if server is alive
        alive = client.is_alive()
        server_up.set(1 if alive else 0)
        if not alive:
            logger.warning("TS6 server is not reachable")
            scrape_errors.inc()
            return

        # Server version (only needs to be set once, but low cost)
        _collect_version(client)

        summary = _collect_server_info(client)
        if summary is not None:
            online, channels = summary

        _collect_bans(client)
        _collect_server_groups(client)
        _collect_clients(client)
    except Exception as e:
        # logger.exception keeps the traceback, which is what is needed to
        # diagnose an error nobody anticipated.
        logger.exception("Unexpected error during collection: %s", e)
        server_up.set(0)
        scrape_errors.inc()
    finally:
        duration = time.perf_counter() - started
        scrape_duration.set(duration)
        logger.info(
            "Scrape completed in %.3fs | clients=%s channels=%s",
            duration,
            online,
            channels,
        )
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Run the exporter until SIGTERM or SIGINT is received.

    Logs the effective configuration, creates the TS6 API client, starts the
    Prometheus HTTP server on EXPORTER_PORT and then calls collect_metrics()
    in a loop, waiting POLL_INTERVAL seconds after each collection.
    """
    logger.info("=" * 60)
    logger.info("TeamSpeak 6 Prometheus Exporter")
    logger.info("=" * 60)
    logger.info("TS6 Host: %s:%s", TS6_HOST, TS6_QUERY_PORT)
    logger.info("Server ID: %s", TS6_SERVER_ID)
    logger.info("Exporter Port: %s", EXPORTER_PORT)
    logger.info("Poll Interval: %ss", POLL_INTERVAL)
    logger.info("=" * 60)

    client = TS6Client(
        host=TS6_HOST,
        port=TS6_QUERY_PORT,
        api_key=TS6_API_KEY,
        server_id=TS6_SERVER_ID,
    )

    # Start Prometheus HTTP server
    start_http_server(EXPORTER_PORT)
    logger.info("Metrics server started on http://0.0.0.0:%s/metrics", EXPORTER_PORT)

    # Graceful shutdown
    running = True

    def shutdown(signum, frame):
        nonlocal running
        logger.info("Received signal %s, shutting down...", signum)
        running = False

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    # Longest time the loop sleeps before re-checking the shutdown flag.
    max_nap = 0.5

    # Main polling loop
    while running:
        collect_metrics(client)

        # time.sleep() resumes after a signal handler returns, so one long
        # sleep would delay shutdown by up to POLL_INTERVAL seconds. Sleep in
        # short slices and re-check the flag between them instead.
        deadline = time.monotonic() + POLL_INTERVAL
        while running:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(remaining, max_nap))

    logger.info("Exporter stopped.")


# Start the exporter only when run as a script, not when imported.
if __name__ == "__main__":
    main()