[{"data":1,"prerenderedAt":5474},["ShallowReactive",2],{"nav-stories":3,"home-projects":61,"home-blog":1219},[4,16,25,34,43,52],{"id":5,"color":6,"extension":7,"image":8,"label":9,"link":10,"meta":11,"order":12,"stem":13,"text":14,"__hash__":15},"stories\u002Fstories\u002F01-data-center.yml",null,"yml","https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1558494949-ef010cbdcc31?w=1080","DATA_CENTER","https:\u002F\u002Fx.com\u002Fabbeytetteh_",{},1,"stories\u002F01-data-center","Racking new servers. 40gbit backbone online.","0QUZQbaANhdO8WemZxkDdO7vbVopfnynHtH9FxBZb_w",{"id":17,"color":6,"extension":7,"image":18,"label":19,"link":6,"meta":20,"order":21,"stem":22,"text":23,"__hash__":24},"stories\u002Fstories\u002F02-thoughts.yml","https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1498050108023-c5249f4df085?w=1080","THOUGHTS",{},2,"stories\u002F02-thoughts","Late night bug hunting. Found the memory leak.","Gd1am954aasY6HRHD7hCtOuessXb6zYZ8iizS501ICg",{"id":26,"color":27,"extension":7,"image":6,"label":28,"link":6,"meta":29,"order":30,"stem":31,"text":32,"__hash__":33},"stories\u002Fstories\u002F03-coding.yml","#3b82f6","CODING",{},3,"stories\u002F03-coding","Just thinking about how much easier life is with Swarm. https:\u002F\u002Fgoogle.com","-WTk-47jnLM-TZRWBg0VbJyZJfIM7FpQ5HGbc8LEdhQ",{"id":35,"color":6,"extension":7,"image":36,"label":37,"link":6,"meta":38,"order":39,"stem":40,"text":41,"__hash__":42},"stories\u002Fstories\u002F04-update.yml","https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1591799264318-7e6ef8ddb7ea?w=1080","UPDATE",{},4,"stories\u002F04-update","New cluster nodes arrived. 
Prepping for installation.","kyT60N5C6Re_jMonZbgNy0PbQhzXmUWxDbD0D_v43ts",{"id":44,"color":45,"extension":7,"image":6,"label":46,"link":6,"meta":47,"order":48,"stem":49,"text":50,"__hash__":51},"stories\u002Fstories\u002F05-setup.yml","#86868b","SETUP",{},5,"stories\u002F05-setup","Optimizing the telemetry pipeline for 1M req\u002Fs.","cPOBkzoyXsCmPgRO2d80Hj3vm4MP-6nAejtlQ5iuSzw",{"id":53,"color":6,"extension":7,"image":54,"label":55,"link":6,"meta":56,"order":57,"stem":58,"text":59,"__hash__":60},"stories\u002Fstories\u002F06-travel.yml","https:\u002F\u002Fimages.unsplash.com\u002Fphoto-1560969184-10fe8719e047?w=1080","TRAVEL",{},6,"stories\u002F06-travel","Travel log — system architecture workshop in Berlin.","jnOxerdF6usAIHdR35Z-opx0LJAy9kZluXnZhtz62Z0",[62,445,522],{"id":63,"title":64,"body":65,"description":433,"extension":434,"liveUrl":6,"meta":435,"navigation":436,"order":21,"path":437,"seo":438,"stack":439,"stem":443,"thumbnail":6,"__hash__":444},"projects\u002Fprojects\u002Flleven-v1.md","Lleven V1: The Genesis",{"type":66,"value":67,"toc":418},"minimark",[68,72,77,85,90,118,122,130,134,137,159,163,178,223,227,251,255,314,322,325,385,389,415],[69,70,71],"p",{},"The first version of Lleven was born out of a simple need: to make sense of Mobile Money (MoMo) statements without the manual hassle. It was a lean, focused tool designed to transform a raw PDF into a \"Wrapped\" experience—much like Spotify, but for your spending.",[73,74,76],"h2",{"id":75},"architecture-overview","Architecture Overview",[69,78,79,80,84],{},"The V1 architecture was a classic ",[81,82,83],"strong",{},"Synchronous Processing"," model. 
It prioritized simplicity and immediate feedback for small-to-medium statements.",[86,87,89],"h3",{"id":88},"tech-stack","Tech Stack",[91,92,93,100,106,112],"ul",{},[94,95,96,99],"li",{},[81,97,98],{},"Framework:"," FastAPI",[94,101,102,105],{},[81,103,104],{},"Processing:"," Pandas & PDFPlumber",[94,107,108,111],{},[81,109,110],{},"Caching:"," Redis",[94,113,114,117],{},[81,115,116],{},"Security:"," Fernet (AES-128) Encryption for cached data",[73,119,121],{"id":120},"the-processing-engine","The Processing Engine",[69,123,124,125,129],{},"Lleven V1 used a specialized parsing engine built on top of ",[126,127,128],"code",{},"pdfplumber",".",[86,131,133],{"id":132},"_1-validation-logic","1. Validation Logic",[69,135,136],{},"Before processing, the system checked for specific \"magic strings\" in the first page of the PDF to ensure it was a valid MTN MoMo statement. These included:",[91,138,139,144,149,154],{},[94,140,141],{},[126,142,143],{},"MSISDN:",[94,145,146],{},[126,147,148],{},"Time Run:",[94,150,151],{},[126,152,153],{},"TRANSACTION DATE",[94,155,156],{},[126,157,158],{},"ACCOUNT HOLDER NAME:",[86,160,162],{"id":161},"_2-data-extraction","2. Data Extraction",[69,164,165,166,169,170,173,174,177],{},"The engine targeted tables with a ",[126,167,168],{},"vertical_strategy"," and ",[126,171,172],{},"horizontal_strategy"," set to ",[126,175,176],{},"\"lines\"",". It mapped raw PDF columns to a structured internal format:",[91,179,180,213],{},[94,181,182,185,186,188,189,188,192,188,195,188,198,188,201,188,204,188,207,188,210,129],{},[81,183,184],{},"Raw Mapping:"," ",[126,187,153],{},", ",[126,190,191],{},"FROM ACCT",[126,193,194],{},"FROM NO.",[126,196,197],{},"TRANS. 
TYPE",[126,199,200],{},"AMOUNT",[126,202,203],{},"TO NO.",[126,205,206],{},"TO NAME",[126,208,209],{},"REF",[126,211,212],{},"OVA",[94,214,215,218,219,222],{},[81,216,217],{},"Cleaning:"," It used regex to identify the ",[126,220,221],{},"ACCOUNT_HOLDER_NO"," from the header text to distinguish between incoming and outgoing funds.",[86,224,226],{"id":225},"_3-data-cleaning-pipeline","3. Data Cleaning Pipeline",[91,228,229,239,245],{},[94,230,231,234,235,238],{},[81,232,233],{},"Date Normalization:"," Converted string dates (e.g., ",[126,236,237],{},"21-May-2023 10:30:00 AM",") into proper Python datetime objects.",[94,240,241,244],{},[81,242,243],{},"Type Casting:"," Converted currency strings into floats for arithmetic operations.",[94,246,247,250],{},[81,248,249],{},"Normalization:"," Removed newline characters and extra whitespace from names and references using custom regex cleaning.",[73,252,254],{"id":253},"the-upload-workflow","The Upload Workflow",[256,257,258,267,277,283,305],"ol",{},[94,259,260,263,264,129],{},[81,261,262],{},"Request:"," User uploads a PDF to ",[126,265,266],{},"\u002Fprocess-file",[94,268,269,272,273,276],{},[81,270,271],{},"Deduplication:"," A SHA-256 hash of the file is generated. 
If the hash exists in Redis, the system returns the existing ",[126,274,275],{},"file_hash"," immediately.",[94,278,279,282],{},[81,280,281],{},"Parsing:"," If new, the system extracts the table, cleans the data, and caps it to the requested year (e.g., 2023).",[94,284,285,288],{},[81,286,287],{},"Encrypted Caching:",[91,289,290,296,302],{},[94,291,292,293,129],{},"The resulting DataFrame is serialized using ",[126,294,295],{},"pickle",[94,297,298,299,129],{},"It is then encrypted using ",[81,300,301],{},"Fernet (symmetric encryption)",[94,303,304],{},"The encrypted blob is stored in Redis with a 1-hour TTL.",[94,306,307,310,311,313],{},[81,308,309],{},"Response:"," Returns the ",[126,312,275],{}," and an expiry timestamp.",[73,315,317,318,321],{"id":316},"the-retrieval-workflow-get-wrapped","The Retrieval Workflow (",[126,319,320],{},"\u002Fget-wrapped",")",[69,323,324],{},"When the user requests their \"Wrapped\" results:",[256,326,327,330,333],{},[94,328,329],{},"The system pulls the encrypted blob from Redis.",[94,331,332],{},"It decrypts and deserializes the DataFrame.",[94,334,335,338,339],{},[81,336,337],{},"On-the-Fly Analytics:"," It runs a series of summary algorithms:\n",[91,340,341,360,366,372],{},[94,342,343,346,347,188,350,188,353,356,357,129],{},[81,344,345],{},"Spending Summary:"," Aggregates totals for ",[126,348,349],{},"PAYMENT",[126,351,352],{},"CASH_OUT",[126,354,355],{},"TRANSFER",", and ",[126,358,359],{},"DEBIT",[94,361,362,365],{},[81,363,364],{},"Frequency Analysis:"," Calculates the top 5 recipients by amount and frequency.",[94,367,368,371],{},[81,369,370],{},"Monthly Trends:"," Groups transactions by month to visualize spending patterns.",[94,373,374,377,378,380,381,384],{},[81,375,376],{},"Credit Summary:"," Identifies salary or incoming transfers by filtering for the user's ",[126,379,221],{}," in the ",[126,382,383],{},"TO_NO"," column.",[73,386,388],{"id":387},"limitations-of-v1","Limitations of 
V1",[91,390,391,397,403,409],{},[94,392,393,396],{},[81,394,395],{},"The \"Timeout\" Wall:"," Large PDFs (50+ pages) often caused HTTP timeouts because the API waited for the entire extraction to finish before responding.",[94,398,399,402],{},[81,400,401],{},"Memory Pressure:"," Since processing happened on the API workers, high concurrent uploads could lead to OOM (Out of Memory) errors.",[94,404,405,408],{},[81,406,407],{},"Stateless Persistence:"," Data only lived in Redis. If the cache expired, the user had to re-upload the file.",[94,410,411,414],{},[81,412,413],{},"Lack of Identity:"," No user accounts meant users couldn't see a history of their past uploads without keeping the file hashes themselves.",[69,416,417],{},"Lleven V1 proved the concept, but the stage was set for a more robust, scalable, and secure V2.",{"title":419,"searchDepth":21,"depth":21,"links":420},"",[421,424,429,430,432],{"id":75,"depth":21,"text":76,"children":422},[423],{"id":88,"depth":30,"text":89},{"id":120,"depth":21,"text":121,"children":425},[426,427,428],{"id":132,"depth":30,"text":133},{"id":161,"depth":30,"text":162},{"id":225,"depth":30,"text":226},{"id":253,"depth":21,"text":254},{"id":316,"depth":21,"text":431},"The Retrieval Workflow (\u002Fget-wrapped)",{"id":387,"depth":21,"text":388},"The first version of Lleven, a synchronous processing engine designed to transform raw Mobile Money PDFs into structured spending analytics.","md",{},true,"\u002Fprojects\u002Flleven-v1",{"title":64,"description":433},[440,441,442],"FastAPI","Pandas","Redis","projects\u002Flleven-v1","HUCFXtkqCH7TKZwiJKSCxzJevh-30bUSAoJtS1E_9Mc",{"id":446,"title":447,"body":448,"description":511,"extension":434,"liveUrl":512,"meta":513,"navigation":436,"order":12,"path":514,"seo":515,"stack":516,"stem":520,"thumbnail":6,"__hash__":521},"projects\u002Fprojects\u002Flleven-v2.md","Lleven V2: Scale & 
Security",{"type":66,"value":449,"toc":506},[450,453,456,460,463,483,486,490,493,496,500,503],[69,451,452],{},"Following the success and the hard lessons learned from the initial prototype, Lleven V2 was completely re-architected. Where V1 hit memory limits and HTTP timeouts with large PDFs, V2 adopts a robust, event-driven architecture designed for scale and security.",[454,455],"reference",{"path":437},[73,457,459],{"id":458},"the-asynchronous-pipeline","The Asynchronous Pipeline",[69,461,462],{},"To solve the \"Timeout Wall\" of V1, the entire parsing engine was decoupled from the HTTP request cycle.",[256,464,465,471,477],{},[94,466,467,470],{},[81,468,469],{},"Upload:"," Files are securely uploaded and placed into an encrypted object store.",[94,472,473,476],{},[81,474,475],{},"Queueing:"," An event is fired to Redis Streams, queuing the document for processing.",[94,478,479,482],{},[81,480,481],{},"Background Workers:"," Dedicated consumer services written in Go pick up the job, run the intensive parsing, and update the database asynchronously.",[69,484,485],{},"This means a user uploading a 200-page statement gets an immediate response, while the system processes the file safely in the background.",[73,487,489],{"id":488},"identity-and-persistence","Identity and Persistence",[69,491,492],{},"A core limitation of V1 was the lack of user identity. In V2, we introduced a robust identity management system. Users can securely authenticate, manage their accounts, and review their processing history.",[69,494,495],{},"Data is no longer strictly ephemeral in a volatile cache; insights are persisted securely, allowing users to track their financial trends over time without re-uploading the same documents.",[73,497,499],{"id":498},"real-time-telemetry","Real-time Telemetry",[69,501,502],{},"With the new architecture, we implemented Server-Sent Events (SSE) to push real-time status updates back to the UI. 
The frontend (rebuilt in Nuxt and Vue) provides a live, reactive experience as the statement goes through validation, extraction, and analytics stages.",[69,504,505],{},"V2 sets a new standard for performance, reliability, and user experience.",{"title":419,"searchDepth":21,"depth":21,"links":507},[508,509,510],{"id":458,"depth":21,"text":459},{"id":488,"depth":21,"text":489},{"id":498,"depth":21,"text":499},"The next generation of the Lleven engine, featuring asynchronous processing, resilient data pipelines, and a secure user identity layer.","https:\u002F\u002Flleven.app",{},"\u002Fprojects\u002Flleven-v2",{"title":447,"description":511},[517,518,519],"Vue","Nuxt","Go","projects\u002Flleven-v2","OqhBSYnbfS0zohtEx0Nj449VSmaWGjiut8rKc8oODdA",{"id":523,"title":524,"body":525,"description":1210,"extension":434,"liveUrl":6,"meta":1211,"navigation":436,"order":30,"path":1212,"seo":1213,"stack":1214,"stem":1217,"thumbnail":6,"__hash__":1218},"projects\u002Fprojects\u002Fproject-overview.md","Iris — Giving LGTM a brain",{"type":66,"value":526,"toc":1187},[527,531,534,537,540,544,547,553,559,565,567,571,575,604,608,625,629,650,654,671,675,692,696,725,729,743,747,764,768,791,795,812,816,830,834,848,852,863,865,869,872,980,983,1003,1005,1009,1020,1025,1073,1083,1086,1094,1097,1099,1103,1129,1131,1135],[73,528,530],{"id":529},"overview","Overview",[69,532,533],{},"Iris is a production-grade infrastructure monitoring platform built for KNUST (Kwame Nkrumah University of Science and Technology). 
It automates the onboarding of servers into a Prometheus-based observability stack, manages the full lifecycle of infrastructure incidents, and gives the monitoring environment an intelligent layer through AI-powered analysis, automated digests, and operational runbooks.",[69,535,536],{},"The system is named after the Greek goddess of the rainbow and messenger of the gods — fitting for a platform whose job is to relay the state of infrastructure to the people responsible for it.",[538,539],"hr",{},[73,541,543],{"id":542},"what-it-does","What It Does",[69,545,546],{},"At its core, Iris solves a real operational problem: getting dozens (or hundreds) of servers properly instrumented, monitored, and connected to the right people when something goes wrong — without doing it manually each time.",[69,548,549,552],{},[81,550,551],{},"Host Enrollment"," is the entry point. An administrator submits a host — Linux or Windows — and Iris SSH or WinRM's into it, installs the Grafana Alloy metrics agent, deploys a configuration tailored to the services running on that host, registers it in Prometheus service discovery, and sends a confirmation notification. What would take 20 minutes manually takes under 2 minutes, consistently, for every host.",[69,554,555,558],{},[81,556,557],{},"Incident Management"," is where Iris does its most important work. AlertManager fires a webhook when Prometheus rules trigger. Iris receives that webhook, enriches the alert with context from a vector knowledge base (relevant runbooks, similar past incidents, infrastructure documentation), generates an AI-authored notification with recommended remediation steps, routes it to the right team via Microsoft Teams and email, and stores the incident for future learning. Every alert becomes more useful than it would be alone.",[69,560,561,564],{},[81,562,563],{},"Operational Intelligence"," sits on top of all of this. 
Iris collects metrics snapshots, tracks maintenance windows with full lifecycle management, notifies teams when services are deployed, delivers daily, weekly, and monthly digest reports scoped to each maintainer's specific responsibilities, and continuously builds a richer knowledge base that makes future incidents faster to resolve.",[538,566],{},[73,568,570],{"id":569},"key-features","Key Features",[86,572,574],{"id":573},"automated-host-enrollment","Automated Host Enrollment",[91,576,577,580,583,586,589,592,595,598,601],{},[94,578,579],{},"SSH-based enrollment for Linux hosts (Debian, RHEL, SUSE, and derivatives)",[94,581,582],{},"WinRM-based enrollment for Windows hosts",[94,584,585],{},"Automatic detection of host OS, architecture, and installed services",[94,587,588],{},"Grafana Alloy agent installation with version management",[94,590,591],{},"Jinja2-templated Alloy configurations backed by a database template store",[94,593,594],{},"Support for service-specific configurations: node metrics, Nginx, Apache, MySQL, PostgreSQL, MongoDB, Windows Performance Counters",[94,596,597],{},"Prometheus target file registration with full label management (hostname, job, environment, service type, host ID)",[94,599,600],{},"Validation of prerequisites: disk space, connectivity, systemd availability",[94,602,603],{},"Firewall rule management for metrics ports",[86,605,607],{"id":606},"batch-enrollment","Batch Enrollment",[91,609,610,613,616,619,622],{},[94,611,612],{},"Bulk enrollment via JSON, CSV, or YAML file upload",[94,614,615],{},"Sequential or concurrent execution strategies",[94,617,618],{},"Per-host progress tracking and result storage",[94,620,621],{},"Retry logic for failed hosts",[94,623,624],{},"Downloadable templates for batch file formats",[86,626,628],{"id":627},"prometheus-grafana-alloy-integration","Prometheus & Grafana Alloy Integration",[91,630,631,634,637,640,643],{},[94,632,633],{},"File-based service discovery for Prometheus",[94,635,636],{},"Support 
for multiple federated Prometheus instances",[94,638,639],{},"Atomic target file updates with Prometheus reload",[94,641,642],{},"Prometheus API querying for target verification and metrics collection",[94,644,645,646,649],{},"Alloy configuration validation using ",[126,647,648],{},"alloy fmt"," on the remote host",[86,651,653],{"id":652},"alertmanager-webhook-processing","AlertManager Webhook Processing",[91,655,656,659,662,665,668],{},[94,657,658],{},"Receives AlertManager v4 webhook payloads",[94,660,661],{},"Asynchronous processing via Celery task queue",[94,663,664],{},"Alert enrichment with runbooks, similar incidents, and infrastructure context",[94,666,667],{},"LLM-generated incident notifications with remediation recommendations",[94,669,670],{},"Team-aware routing based on service and host ownership",[86,672,674],{"id":673},"ai-powered-incident-intelligence-rag","AI-Powered Incident Intelligence (RAG)",[91,676,677,680,683,686,689],{},[94,678,679],{},"Weaviate vector database for semantic storage and retrieval",[94,681,682],{},"Ollama-backed embeddings (nomic-embed-text) and language model (Llama 3)",[94,684,685],{},"Three knowledge collections: runbooks, past incidents, infrastructure documentation",[94,687,688],{},"Semantic search across all collections at incident time",[94,690,691],{},"Continuously growing knowledge base as incidents are processed and resolved",[86,693,695],{"id":694},"host-tagging-system","Host Tagging System",[91,697,698,716,719,722],{},[94,699,700,701,188,704,188,707,188,710,188,713],{},"Operational tags: ",[126,702,703],{},"ignore_alerts",[126,705,706],{},"known_issue",[126,708,709],{},"under_maintenance",[126,711,712],{},"flaky",[126,714,715],{},"custom",[94,717,718],{},"Optional expiry timestamps for temporary tags",[94,720,721],{},"Metadata key-value pairs for custom annotation",[94,723,724],{},"Notification suppression for tagged hosts",[86,726,728],{"id":727},"incident-lifecycle-management","Incident Lifecycle 
Management",[91,730,731,734,737,740],{},[94,732,733],{},"Full incident storage with status tracking (firing → resolved)",[94,735,736],{},"Resolution notes and root cause documentation",[94,738,739],{},"Filter and query by alert name, instance, severity, service type, status",[94,741,742],{},"Resolution time tracking and SLA flagging in digest reports",[86,744,746],{"id":745},"automated-digests","Automated Digests",[91,748,749,752,755,758,761],{},[94,750,751],{},"Daily, weekly, and monthly digest reports via Celery Beat",[94,753,754],{},"Personalized: each maintainer receives only their hosts and services",[94,756,757],{},"Severity distribution, SLA flags, and resolution status",[94,759,760],{},"Delivered via Microsoft Teams adaptive cards and HTML email",[94,762,763],{},"Stakeholder segmentation: scoped maintainers, service stakeholders, infrastructure stakeholders, general recipients",[86,765,767],{"id":766},"maintenance-window-management","Maintenance Window Management",[91,769,770,773,776,779,782,785,788],{},[94,771,772],{},"Full lifecycle: plan → start → extend → end \u002F cancel",[94,774,775],{},"Types: scheduled and emergency",[94,777,778],{},"Categories: infrastructure, application, network, database, security",[94,780,781],{},"Automated notifications at window start, during, and on completion",[94,783,784],{},"AI-generated summaries of maintenance activities",[94,786,787],{},"Configurable reminder scheduling before planned windows",[94,789,790],{},"Bulk cancel operations",[86,792,794],{"id":793},"deployment-notifications","Deployment Notifications",[91,796,797,800,803,806,809],{},[94,798,799],{},"Deployment event ingestion with version, environment, and service information",[94,801,802],{},"Semantic version comparison (upgrade vs rollback detection)",[94,804,805],{},"AI-generated change summaries from commit messages",[94,807,808],{},"Notifications routed to service maintainers",[94,810,811],{},"Deployment record storage for audit 
trail",[86,813,815],{"id":814},"metrics-collection-snapshots","Metrics Collection & Snapshots",[91,817,818,821,824,827],{},[94,819,820],{},"Prometheus metrics scraped and stored in PostgreSQL every 15 minutes",[94,822,823],{},"Configurable retention (default 90 days) with automated pruning",[94,825,826],{},"CPU, memory, and disk thresholds with warning and critical levels",[94,828,829],{},"Historical trend data for digest and reporting card generation",[86,831,833],{"id":832},"service-maintainer-registry","Service & Maintainer Registry",[91,835,836,839,842,845],{},[94,837,838],{},"Service registry with maintainer and escalation manager assignments",[94,840,841],{},"Host-level maintainer overrides independent of service assignments",[94,843,844],{},"Notification preference flags per host and service",[94,846,847],{},"Used across enrollment, incident routing, digests, and deployment notifications",[86,849,851],{"id":850},"observability-of-iris-itself","Observability of Iris Itself",[91,853,854,857,860],{},[94,855,856],{},"OpenTelemetry integration for distributed tracing, metrics, and structured logging",[94,858,859],{},"Configurable OTLP export to a collector endpoint",[94,861,862],{},"Per-service tracing across FastAPI routes and Celery tasks",[538,864],{},[73,866,868],{"id":867},"architecture","Architecture",[69,870,871],{},"Iris is built on a modern async Python stack:",[873,874,875,888],"table",{},[876,877,878],"thead",{},[879,880,881,885],"tr",{},[882,883,884],"th",{},"Layer",[882,886,887],{},"Technology",[889,890,891,900,908,916,924,932,940,948,956,964,972],"tbody",{},[879,892,893,897],{},[894,895,896],"td",{},"API Framework",[894,898,899],{},"FastAPI (async)",[879,901,902,905],{},[894,903,904],{},"Task Queue",[894,906,907],{},"Celery with Redis broker",[879,909,910,913],{},[894,911,912],{},"Scheduler",[894,914,915],{},"Celery Beat",[879,917,918,921],{},[894,919,920],{},"Primary Database",[894,922,923],{},"PostgreSQL 16 (via asyncpg \u002F 
SQLModel)",[879,925,926,929],{},[894,927,928],{},"Vector Database",[894,930,931],{},"Weaviate 1.33",[879,933,934,937],{},[894,935,936],{},"AI \u002F Embeddings",[894,938,939],{},"Ollama (Llama 3, nomic-embed-text)",[879,941,942,945],{},[894,943,944],{},"Remote Execution",[894,946,947],{},"Paramiko (SSH), PyWinRM (WinRM)",[879,949,950,953],{},[894,951,952],{},"Notifications",[894,954,955],{},"Microsoft Teams (webhooks + Graph API), KNUST Email Gateway",[879,957,958,961],{},[894,959,960],{},"Monitoring Agent",[894,962,963],{},"Grafana Alloy",[879,965,966,969],{},[894,967,968],{},"Metrics Source",[894,970,971],{},"Prometheus",[879,973,974,977],{},[894,975,976],{},"Observability",[894,978,979],{},"OpenTelemetry",[69,981,982],{},"The application is split into three cooperating processes:",[91,984,985,991,997],{},[94,986,987,990],{},[81,988,989],{},"iris"," — the FastAPI API server, handling synchronous request\u002Fresponse and dispatching async work",[94,992,993,996],{},[81,994,995],{},"iris-worker"," — one or more Celery worker processes executing enrollment, incident processing, digest generation, maintenance, and deployment tasks",[94,998,999,1002],{},[81,1000,1001],{},"iris-beat"," — the Celery Beat scheduler driving periodic tasks (digests, metrics collection, metric pruning)",[538,1004],{},[73,1006,1008],{"id":1007},"deployment","Deployment",[69,1010,1011,1012,1015,1016,1019],{},"Iris runs on Docker Swarm in production, deployed to KNUST's internal Docker registry (",[126,1013,1014],{},"dreg.knust.edu.gh","). 
The stack is pinned to a specific monitoring node (",[126,1017,1018],{},"knust-monitoring",") and shares a Docker overlay network with the rest of the monitoring stack (Prometheus, Grafana, AlertManager, Weaviate, Ollama).",[69,1021,1022],{},[81,1023,1024],{},"Production resource allocation:",[873,1026,1027,1040],{},[876,1028,1029],{},[879,1030,1031,1034,1037],{},[882,1032,1033],{},"Container",[882,1035,1036],{},"Memory",[882,1038,1039],{},"CPU",[889,1041,1042,1053,1063],{},[879,1043,1044,1047,1050],{},[894,1045,1046],{},"iris (API)",[894,1048,1049],{},"2 GB limit \u002F 512 MB reserved",[894,1051,1052],{},"1.0 \u002F 0.25",[879,1054,1055,1057,1060],{},[894,1056,995],{},[894,1058,1059],{},"4 GB limit \u002F 1 GB reserved",[894,1061,1062],{},"2.0 \u002F 0.5",[879,1064,1065,1067,1070],{},[894,1066,1001],{},[894,1068,1069],{},"Lightweight",[894,1071,1072],{},"Minimal",[69,1074,1075,1076,1079,1080,129],{},"Prometheus target files are mounted directly from the host at ",[126,1077,1078],{},"\u002Fopt\u002Fmonitoring-stack\u002Fshared\u002Fprometheus\u002Ftargets",", allowing Iris to write target files that Prometheus reads without any additional network hop. 
Iris configuration files (templates, credentials) are mounted from ",[126,1081,1082],{},"\u002Fopt\u002Fmonitoring-stack\u002Fshared\u002Firis\u002F",[69,1084,1085],{},"The application is built from a two-stage Dockerfile:",[256,1087,1088,1091],{},[94,1089,1090],{},"A builder stage installs Python dependencies from KNUST's private PyPI registry",[94,1092,1093],{},"A slim runtime stage runs the application as a non-root user",[69,1095,1096],{},"Database migrations are managed with Alembic, and schema evolution is handled as part of the deployment pipeline.",[538,1098],{},[73,1100,1102],{"id":1101},"security","Security",[91,1104,1105,1108,1111,1114,1117,1120,1123,1126],{},[94,1106,1107],{},"API key authentication middleware (configurable, disabled in dev)",[94,1109,1110],{},"All secrets externalized to environment variables",[94,1112,1113],{},"SSH private key support for host enrollment (no password storage required)",[94,1115,1116],{},"Azure AD app registration for Microsoft Teams Graph API access",[94,1118,1119],{},"KNUST email gateway uses API key authentication over HTTPS",[94,1121,1122],{},"CORS configured per environment (open in dev, restricted in production)",[94,1124,1125],{},"Non-root container user in production images",[94,1127,1128],{},"Internal Docker network isolation between services",[538,1130],{},[73,1132,1134],{"id":1133},"technology-summary","Technology Summary",[69,1136,1137,1140,1141,1140,1143,1140,1146,1140,1149,1140,1152,1140,1155,1140,1157,1140,1159,1140,1162,1140,1165,1140,1167,1140,1170,1140,1172,1140,1175,1140,1178,1140,1181,1140,1184],{},[81,1138,1139],{},"Python 3.12"," · ",[81,1142,440],{},[81,1144,1145],{},"Celery",[81,1147,1148],{},"PostgreSQL",[81,1150,1151],{},"Weaviate",[81,1153,1154],{},"Ollama",[81,1156,963],{},[81,1158,971],{},[81,1160,1161],{},"AlertManager",[81,1163,1164],{},"Microsoft Teams",[81,1166,979],{},[81,1168,1169],{},"Docker 
Swarm",[81,1171,442],{},[81,1173,1174],{},"Paramiko",[81,1176,1177],{},"SQLModel",[81,1179,1180],{},"Alembic",[81,1182,1183],{},"Jinja2",[81,1185,1186],{},"LangChain",{"title":419,"searchDepth":21,"depth":21,"links":1188},[1189,1190,1191,1206,1207,1208,1209],{"id":529,"depth":21,"text":530},{"id":542,"depth":21,"text":543},{"id":569,"depth":21,"text":570,"children":1192},[1193,1194,1195,1196,1197,1198,1199,1200,1201,1202,1203,1204,1205],{"id":573,"depth":30,"text":574},{"id":606,"depth":30,"text":607},{"id":627,"depth":30,"text":628},{"id":652,"depth":30,"text":653},{"id":673,"depth":30,"text":674},{"id":694,"depth":30,"text":695},{"id":727,"depth":30,"text":728},{"id":745,"depth":30,"text":746},{"id":766,"depth":30,"text":767},{"id":793,"depth":30,"text":794},{"id":814,"depth":30,"text":815},{"id":832,"depth":30,"text":833},{"id":850,"depth":30,"text":851},{"id":867,"depth":21,"text":868},{"id":1007,"depth":21,"text":1008},{"id":1101,"depth":21,"text":1102},{"id":1133,"depth":21,"text":1134},"A production-grade infrastructure monitoring platform built for KNUST that automates server onboarding and incident management.",{},"\u002Fprojects\u002Fproject-overview",{"title":524,"description":1210},[971,1215,1216],"Grafana","Alloy","projects\u002Fproject-overview","Sn0pP1YAnd1n2VzaB74IfOLY6HyS1GRFqlRJWYlqNO0",[1220,2332,3011,3629,3882,5353],{"id":1221,"title":1222,"body":1223,"category":2319,"date":2320,"description":2321,"extension":434,"meta":2322,"navigation":436,"path":2323,"readTime":2324,"seo":2325,"stem":2326,"tags":2327,"thumbnail":2330,"__hash__":2331},"blog\u002Fblog\u002Flgtm-stack\u002Fpart-1.md","Building a Production Monitoring Stack from Scratch — Part 1: Prometheus, Grafana, Node Exporter & 
AlertManager",{"type":66,"value":1224,"toc":2307},[1225,1237,1239,1243,1246,1249,1252,1255,1257,1261,1264,1315,1326,1329,1338,1345,1347,1351,1358,1409,1419,1422,1429,1593,1665,1677,1679,1687,1693,1707,1713,1730,1736,1738,1742,1745,1748,1753,1762,1767,1776,1786,1811,1817,1826,1837,1848,1850,1854,1857,1860,2091,2105,2108,2179,2189,2191,2198,2208,2211,2260,2267,2276,2279,2281,2285,2288,2291,2294,2297,2300,2303],[1226,1227,1228],"blockquote",{},[69,1229,1230,1233,1234],{},[81,1231,1232],{},"Series:"," From NagiosXI to a Modern Observability Stack\n",[81,1235,1236],{},"Part 1 of 4",[538,1238],{},[73,1240,1242],{"id":1241},"the-problem-with-nagiosxi","The Problem with NagiosXI",[69,1244,1245],{},"We had been running NagiosXI for a while. It worked, in the way that something can work while also quietly frustrating everyone who touches it. It checked hosts, fired alerts, and we had even wired up scripts to push notifications to Mattermost. But the gaps were real and getting harder to ignore.",[69,1247,1248],{},"It was a paid solution running on our own infrastructure — a licensing cost that got harder to justify every time someone asked for something it couldn't do. OpenTelemetry support was essentially nonexistent. Application log aggregation wasn't on the table at all. Every extension we had made through plugins had taken us about as far as plugins could go.",[69,1250,1251],{},"The conversation about replacing it had been happening for a while. Eventually it stopped being a conversation and became a project. The goal: a full open-source replacement covering host metrics, alerting, log aggregation, and eventually distributed tracing. One cohesive system instead of a patchwork.",[69,1253,1254],{},"I took on the work. 
Phase 1 was about standing up the foundation and proving it could actually replace what NagiosXI was doing before we went further.",[538,1256],{},[73,1258,1260],{"id":1259},"the-starting-point","The Starting Point",[69,1262,1263],{},"The first week was spent getting four things working together:",[873,1265,1266,1276],{},[876,1267,1268],{},[879,1269,1270,1273],{},[882,1271,1272],{},"Component",[882,1274,1275],{},"Role",[889,1277,1278,1287,1296,1306],{},[879,1279,1280,1284],{},[894,1281,1282],{},[81,1283,971],{},[894,1285,1286],{},"Time-series metrics database and scraping engine",[879,1288,1289,1293],{},[894,1290,1291],{},[81,1292,1215],{},[894,1294,1295],{},"Visualization and dashboarding",[879,1297,1298,1303],{},[894,1299,1300],{},[81,1301,1302],{},"Node Exporter",[894,1304,1305],{},"Host-level metrics (CPU, memory, disk, network)",[879,1307,1308,1312],{},[894,1309,1310],{},[81,1311,1161],{},[894,1313,1314],{},"Alert routing, grouping, and silencing",[69,1316,1317,1318,1321,1322,1325],{},"The deployment runs across two nodes — ",[81,1319,1320],{},"mon-node-a"," for data collection (Prometheus, AlertManager, and agent-side components) and ",[81,1323,1324],{},"mon-node-b"," for presentation (Grafana). Keeping the presentation layer separate from the data layer was a deliberate decision: if we need to update or rebuild Grafana, it doesn't touch Prometheus, and vice versa. 
Everything runs in Docker.",[69,1327,1328],{},"How these pieces talk to each other matters, because one architectural choice here — pull vs push — ended up being the central problem in Part 2.",[1330,1331,1336],"pre",{"className":1332,"code":1334,"language":1335},[1333],"language-text","[ Linux Hosts ]\n      |\n  node_exporter  (runs on each host, exposes \u002Fmetrics on port 9100)\n      |\n      ↓  (pull — Prometheus reaches out every 15s)\n[ Prometheus ]  ←── scrape_configs + alerting_rules\n      |\n      ├──→ [ AlertManager ]\n      |           |\n      |           └──→ Email \u002F Mattermost\n      |\n[ Grafana ]  ←── queries Prometheus via PromQL\n","text",[126,1337,1334],{"__ignoreMap":419},[69,1339,1340,1341,1344],{},"Prometheus is ",[81,1342,1343],{},"pull-based",". It reaches out to each target on a schedule and pulls metrics. The targets don't know Prometheus exists — they just expose an HTTP endpoint and wait. This distinction ends up mattering a lot.",[538,1346],{},[73,1348,1350],{"id":1349},"getting-host-metrics-in","Getting Host Metrics In",[69,1352,1353,1354,1357],{},"Node Exporter is a lightweight binary that runs on each host and exposes hardware and OS-level metrics at a ",[126,1355,1356],{},"\u002Fmetrics"," HTTP endpoint. 
Deploy one per machine, point Prometheus at it, done.",[1330,1359,1363],{"className":1360,"code":1361,"language":1362,"meta":419,"style":419},"language-bash shiki shiki-themes vitesse-light","# Verify it's running\ncurl http:\u002F\u002F\u003Chost-ip>:9100\u002Fmetrics | head -50\n","bash",[126,1364,1365,1373],{"__ignoreMap":419},[1366,1367,1369],"span",{"class":1368,"line":12},"line",[1366,1370,1372],{"class":1371},"s8zF2","# Verify it's running\n",[1366,1374,1375,1379,1383,1387,1390,1393,1396,1399,1402,1405],{"class":1368,"line":21},[1366,1376,1378],{"class":1377},"sySUi","curl",[1366,1380,1382],{"class":1381},"spphp"," http:\u002F\u002F",[1366,1384,1386],{"class":1385},"si04Y","\u003C",[1366,1388,1389],{"class":1381},"host-i",[1366,1391,69],{"class":1392},"suHK_",[1366,1394,1395],{"class":1385},">",[1366,1397,1398],{"class":1381},":9100\u002Fmetrics",[1366,1400,1401],{"class":1385}," |",[1366,1403,1404],{"class":1377}," head",[1366,1406,1408],{"class":1407},"sEi1f"," -50\n",[69,1410,1411,1412,169,1415,1418],{},"If you see ",[126,1413,1414],{},"# HELP",[126,1416,1417],{},"# TYPE"," blocks followed by metric lines, you're good.",[69,1420,1421],{},"Getting the metrics in wasn't the hard part. The harder part was getting them in cleanly, with enough context attached that alerts and dashboards would actually be useful. A raw IP address as the target label tells you very little when something breaks at 2am.",[69,1423,1424,1425,1428],{},"The solution was file-based service discovery with rich labels. 
Instead of listing targets directly in ",[126,1426,1427],{},"prometheus.yml",", Prometheus watches a directory of JSON files:",[1330,1430,1434],{"className":1431,"code":1432,"language":1433,"meta":419,"style":419},"language-json shiki shiki-themes vitesse-light","[\n  {\n    \"targets\": [\"192.168.0.101:9100\"],\n    \"labels\": {\n      \"hostname\": \"web-server-01\",\n      \"environment\": \"production\",\n      \"location\": \"Primary Rack\",\n      \"maintainers\": \"admin@domain.com\"\n    }\n  }\n]\n","json",[126,1435,1436,1442,1447,1477,1491,1514,1534,1555,1575,1581,1587],{"__ignoreMap":419},[1366,1437,1438],{"class":1368,"line":12},[1366,1439,1441],{"class":1440},"sYZai","[\n",[1366,1443,1444],{"class":1368,"line":21},[1366,1445,1446],{"class":1440},"  {\n",[1366,1448,1449,1453,1457,1460,1463,1466,1469,1472,1474],{"class":1368,"line":30},[1366,1450,1452],{"class":1451},"s61at","    \"",[1366,1454,1456],{"class":1455},"su6XF","targets",[1366,1458,1459],{"class":1451},"\"",[1366,1461,1462],{"class":1440},":",[1366,1464,1465],{"class":1440}," [",[1366,1467,1459],{"class":1468},"sSP4y",[1366,1470,1471],{"class":1381},"192.168.0.101:9100",[1366,1473,1459],{"class":1468},[1366,1475,1476],{"class":1440},"],\n",[1366,1478,1479,1481,1484,1486,1488],{"class":1368,"line":39},[1366,1480,1452],{"class":1451},[1366,1482,1483],{"class":1455},"labels",[1366,1485,1459],{"class":1451},[1366,1487,1462],{"class":1440},[1366,1489,1490],{"class":1440}," {\n",[1366,1492,1493,1496,1499,1501,1503,1506,1509,1511],{"class":1368,"line":48},[1366,1494,1495],{"class":1451},"      \"",[1366,1497,1498],{"class":1455},"hostname",[1366,1500,1459],{"class":1451},[1366,1502,1462],{"class":1440},[1366,1504,1505],{"class":1468}," 
\"",[1366,1507,1508],{"class":1381},"web-server-01",[1366,1510,1459],{"class":1468},[1366,1512,1513],{"class":1440},",\n",[1366,1515,1516,1518,1521,1523,1525,1527,1530,1532],{"class":1368,"line":57},[1366,1517,1495],{"class":1451},[1366,1519,1520],{"class":1455},"environment",[1366,1522,1459],{"class":1451},[1366,1524,1462],{"class":1440},[1366,1526,1505],{"class":1468},[1366,1528,1529],{"class":1381},"production",[1366,1531,1459],{"class":1468},[1366,1533,1513],{"class":1440},[1366,1535,1537,1539,1542,1544,1546,1548,1551,1553],{"class":1368,"line":1536},7,[1366,1538,1495],{"class":1451},[1366,1540,1541],{"class":1455},"location",[1366,1543,1459],{"class":1451},[1366,1545,1462],{"class":1440},[1366,1547,1505],{"class":1468},[1366,1549,1550],{"class":1381},"Primary Rack",[1366,1552,1459],{"class":1468},[1366,1554,1513],{"class":1440},[1366,1556,1558,1560,1563,1565,1567,1569,1572],{"class":1368,"line":1557},8,[1366,1559,1495],{"class":1451},[1366,1561,1562],{"class":1455},"maintainers",[1366,1564,1459],{"class":1451},[1366,1566,1462],{"class":1440},[1366,1568,1505],{"class":1468},[1366,1570,1571],{"class":1381},"admin@domain.com",[1366,1573,1574],{"class":1468},"\"\n",[1366,1576,1578],{"class":1368,"line":1577},9,[1366,1579,1580],{"class":1440},"    }\n",[1366,1582,1584],{"class":1368,"line":1583},10,[1366,1585,1586],{"class":1440},"  }\n",[1366,1588,1590],{"class":1368,"line":1589},11,[1366,1591,1592],{"class":1440},"]\n",[1330,1594,1598],{"className":1595,"code":1596,"language":1597,"meta":419,"style":419},"language-yaml shiki shiki-themes vitesse-light","# prometheus.yml\nscrape_configs:\n  - job_name: \"node_exporter\"\n    file_sd_configs:\n      - files:\n          - \u002Fetc\u002Fprometheus\u002Ftargets\u002F*.json\n        refresh_interval: 30s\n","yaml",[126,1599,1600,1605,1613,1630,1637,1647,1655],{"__ignoreMap":419},[1366,1601,1602],{"class":1368,"line":12},[1366,1603,1604],{"class":1371},"# 
prometheus.yml\n",[1366,1606,1607,1610],{"class":1368,"line":21},[1366,1608,1609],{"class":1455},"scrape_configs",[1366,1611,1612],{"class":1440},":\n",[1366,1614,1615,1618,1621,1623,1625,1628],{"class":1368,"line":30},[1366,1616,1617],{"class":1440},"  -",[1366,1619,1620],{"class":1455}," job_name",[1366,1622,1462],{"class":1440},[1366,1624,1505],{"class":1468},[1366,1626,1627],{"class":1381},"node_exporter",[1366,1629,1574],{"class":1468},[1366,1631,1632,1635],{"class":1368,"line":39},[1366,1633,1634],{"class":1455},"    file_sd_configs",[1366,1636,1612],{"class":1440},[1366,1638,1639,1642,1645],{"class":1368,"line":48},[1366,1640,1641],{"class":1440},"      -",[1366,1643,1644],{"class":1455}," files",[1366,1646,1612],{"class":1440},[1366,1648,1649,1652],{"class":1368,"line":57},[1366,1650,1651],{"class":1440},"          -",[1366,1653,1654],{"class":1381}," \u002Fetc\u002Fprometheus\u002Ftargets\u002F*.json\n",[1366,1656,1657,1660,1662],{"class":1368,"line":1536},[1366,1658,1659],{"class":1455},"        refresh_interval",[1366,1661,1462],{"class":1440},[1366,1663,1664],{"class":1381}," 30s\n",[69,1666,1667,1668,1671,1672,1676],{},"Drop a file in, get a monitored host within 30 seconds. No reload required. The labels on each target flow through to every metric scraped from that host — which means they're available in alert annotations, in Grafana, everywhere. When ",[126,1669,1670],{},"HostDown"," fires, the alert can say ",[1673,1674,1675],"em",{},"which"," host, in which environment, and who to contact. That's the payoff.",[538,1678],{},[73,1680,1682,1683,1686],{"id":1681},"the-up-metric","The ",[126,1684,1685],{},"up"," Metric",[69,1688,1689,1690,1692],{},"One of Prometheus's built-in synthetic metrics is ",[126,1691,1685],{},". 
For every scrape target:",[91,1694,1695,1701],{},[94,1696,1697,1700],{},[126,1698,1699],{},"up = 1"," — scrape succeeded",[94,1702,1703,1706],{},[126,1704,1705],{},"up = 0"," — scrape failed",[69,1708,1709,1710,1712],{},"This is the most fundamental health signal in the stack. Everything else — CPU, memory, disk — is meaningless if you can't even reach the host. And because ",[126,1711,1685],{}," carries all the labels from your target file, you can immediately see which host is down, in which environment.",[1330,1714,1718],{"className":1715,"code":1716,"language":1717,"meta":419,"style":419},"language-promql shiki shiki-themes vitesse-light","# All down hosts right now\nup{job=\"node_exporter\"} == 0\n","promql",[126,1719,1720,1725],{"__ignoreMap":419},[1366,1721,1722],{"class":1368,"line":12},[1366,1723,1724],{},"# All down hosts right now\n",[1366,1726,1727],{"class":1368,"line":21},[1366,1728,1729],{},"up{job=\"node_exporter\"} == 0\n",[69,1731,1732,1733,1735],{},"I keep coming back to ",[126,1734,1685],{}," throughout this series because it's also where things can silently break if you change the architecture carelessly. More on that in Part 2.",[538,1737],{},[73,1739,1741],{"id":1740},"dashboards","Dashboards",[69,1743,1744],{},"Grafana connects to Prometheus as a data source and queries it via PromQL. 
The community dashboards are easy to import and useful for getting started, but building your own is worth doing because it forces you to understand exactly what you're looking at.",[69,1746,1747],{},"The core panels and the queries behind them:",[69,1749,1750],{},[81,1751,1752],{},"CPU Usage (%)",[1330,1754,1756],{"className":1715,"code":1755,"language":1717,"meta":419,"style":419},"100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)\n",[126,1757,1758],{"__ignoreMap":419},[1366,1759,1760],{"class":1368,"line":12},[1366,1761,1755],{},[69,1763,1764],{},[81,1765,1766],{},"Memory Usage (%)",[1330,1768,1770],{"className":1715,"code":1769,"language":1717,"meta":419,"style":419},"(1 - (node_memory_MemAvailable_bytes \u002F node_memory_MemTotal_bytes)) * 100\n",[126,1771,1772],{"__ignoreMap":419},[1366,1773,1774],{"class":1368,"line":12},[1366,1775,1769],{},[69,1777,1778,1781,1782,1785],{},[81,1779,1780],{},"Disk Usage (%)"," — the ",[126,1783,1784],{},"fstype"," filter excludes Docker overlays and tmpfs mounts that inflate results",[1330,1787,1789],{"className":1715,"code":1788,"language":1717,"meta":419,"style":419},"(1 - (\n  node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay\"} \u002F\n  node_filesystem_size_bytes{fstype!~\"tmpfs|overlay\"}\n)) * 100\n",[126,1790,1791,1796,1801,1806],{"__ignoreMap":419},[1366,1792,1793],{"class":1368,"line":12},[1366,1794,1795],{},"(1 - (\n",[1366,1797,1798],{"class":1368,"line":21},[1366,1799,1800],{},"  node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay\"} \u002F\n",[1366,1802,1803],{"class":1368,"line":30},[1366,1804,1805],{},"  node_filesystem_size_bytes{fstype!~\"tmpfs|overlay\"}\n",[1366,1807,1808],{"class":1368,"line":39},[1366,1809,1810],{},")) * 100\n",[69,1812,1813,1816],{},[81,1814,1815],{},"Fleet status"," — a stat panel showing every host's current 
state",[1330,1818,1820],{"className":1715,"code":1819,"language":1717,"meta":419,"style":419},"up{job=\"node_exporter\"}\n",[126,1821,1822],{"__ignoreMap":419},[1366,1823,1824],{"class":1368,"line":12},[1366,1825,1819],{},[69,1827,1828,1829,1832,1833,1836],{},"Value mappings: ",[126,1830,1831],{},"1"," → 🟢 UP, ",[126,1834,1835],{},"0"," → 🔴 DOWN.",[69,1838,1839,1840,1843,1844,1847],{},"Adding a dashboard variable for ",[126,1841,1842],{},"instance"," — ",[126,1845,1846],{},"label_values(up{job=\"node_exporter\"}, instance)"," — gives you a dropdown to filter to a specific host or view the whole fleet. That one change makes the dashboard genuinely useful for day-to-day operations.",[538,1849],{},[73,1851,1853],{"id":1852},"alerting","Alerting",[69,1855,1856],{},"Prometheus evaluates alerting rules and forwards firing alerts to AlertManager. AlertManager handles the business logic: who gets notified, when, how often, and what to suppress.",[69,1858,1859],{},"The rules themselves live in separate YAML files:",[1330,1861,1863],{"className":1595,"code":1862,"language":1597,"meta":419,"style":419},"groups:\n  - name: node_exporter_alerts\n    rules:\n\n      - alert: HostDown\n        expr: up{job=\"node_exporter\"} == 0\n        for: 2m\n        labels:\n          severity: critical\n        annotations:\n          summary: \"Host {{ $labels.instance }} is down\"\n          description: >\n            {{ $labels.hostname }} has been unreachable for more than 2 minutes.\n            Maintainers: {{ $labels.maintainers }}\n\n      - alert: HighCPUUsage\n        expr: >\n          100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) > 85\n        for: 5m\n        labels:\n          severity: warning\n        annotations:\n          summary: \"High CPU on {{ $labels.instance }}\"\n          description: >\n            CPU on {{ $labels.hostname }} has been above 85% for 5 minutes.\n            Current: {{ $value | printf \"%.1f\" 
}}%\n",[126,1864,1865,1872,1884,1891,1896,1908,1918,1928,1935,1945,1952,1966,1978,1984,1990,1995,2007,2016,2022,2032,2039,2049,2056,2070,2079,2085],{"__ignoreMap":419},[1366,1866,1867,1870],{"class":1368,"line":12},[1366,1868,1869],{"class":1455},"groups",[1366,1871,1612],{"class":1440},[1366,1873,1874,1876,1879,1881],{"class":1368,"line":21},[1366,1875,1617],{"class":1440},[1366,1877,1878],{"class":1455}," name",[1366,1880,1462],{"class":1440},[1366,1882,1883],{"class":1381}," node_exporter_alerts\n",[1366,1885,1886,1889],{"class":1368,"line":30},[1366,1887,1888],{"class":1455},"    rules",[1366,1890,1612],{"class":1440},[1366,1892,1893],{"class":1368,"line":39},[1366,1894,1895],{"emptyLinePlaceholder":436},"\n",[1366,1897,1898,1900,1903,1905],{"class":1368,"line":48},[1366,1899,1641],{"class":1440},[1366,1901,1902],{"class":1455}," alert",[1366,1904,1462],{"class":1440},[1366,1906,1907],{"class":1381}," HostDown\n",[1366,1909,1910,1913,1915],{"class":1368,"line":57},[1366,1911,1912],{"class":1455},"        expr",[1366,1914,1462],{"class":1440},[1366,1916,1917],{"class":1381}," up{job=\"node_exporter\"} == 0\n",[1366,1919,1920,1923,1925],{"class":1368,"line":1536},[1366,1921,1922],{"class":1455},"        for",[1366,1924,1462],{"class":1440},[1366,1926,1927],{"class":1381}," 2m\n",[1366,1929,1930,1933],{"class":1368,"line":1557},[1366,1931,1932],{"class":1455},"        labels",[1366,1934,1612],{"class":1440},[1366,1936,1937,1940,1942],{"class":1368,"line":1577},[1366,1938,1939],{"class":1455},"          severity",[1366,1941,1462],{"class":1440},[1366,1943,1944],{"class":1381}," critical\n",[1366,1946,1947,1950],{"class":1368,"line":1583},[1366,1948,1949],{"class":1455},"        annotations",[1366,1951,1612],{"class":1440},[1366,1953,1954,1957,1959,1961,1964],{"class":1368,"line":1589},[1366,1955,1956],{"class":1455},"          summary",[1366,1958,1462],{"class":1440},[1366,1960,1505],{"class":1468},[1366,1962,1963],{"class":1381},"Host {{ $labels.instance }} is 
down",[1366,1965,1574],{"class":1468},[1366,1967,1969,1972,1974],{"class":1368,"line":1968},12,[1366,1970,1971],{"class":1455},"          description",[1366,1973,1462],{"class":1440},[1366,1975,1977],{"class":1976},"sbBg2"," >\n",[1366,1979,1981],{"class":1368,"line":1980},13,[1366,1982,1983],{"class":1381},"            {{ $labels.hostname }} has been unreachable for more than 2 minutes.\n",[1366,1985,1987],{"class":1368,"line":1986},14,[1366,1988,1989],{"class":1381},"            Maintainers: {{ $labels.maintainers }}\n",[1366,1991,1993],{"class":1368,"line":1992},15,[1366,1994,1895],{"emptyLinePlaceholder":436},[1366,1996,1998,2000,2002,2004],{"class":1368,"line":1997},16,[1366,1999,1641],{"class":1440},[1366,2001,1902],{"class":1455},[1366,2003,1462],{"class":1440},[1366,2005,2006],{"class":1381}," HighCPUUsage\n",[1366,2008,2010,2012,2014],{"class":1368,"line":2009},17,[1366,2011,1912],{"class":1455},[1366,2013,1462],{"class":1440},[1366,2015,1977],{"class":1976},[1366,2017,2019],{"class":1368,"line":2018},18,[1366,2020,2021],{"class":1381},"          100 - (avg by (instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) > 85\n",[1366,2023,2025,2027,2029],{"class":1368,"line":2024},19,[1366,2026,1922],{"class":1455},[1366,2028,1462],{"class":1440},[1366,2030,2031],{"class":1381}," 5m\n",[1366,2033,2035,2037],{"class":1368,"line":2034},20,[1366,2036,1932],{"class":1455},[1366,2038,1612],{"class":1440},[1366,2040,2042,2044,2046],{"class":1368,"line":2041},21,[1366,2043,1939],{"class":1455},[1366,2045,1462],{"class":1440},[1366,2047,2048],{"class":1381}," warning\n",[1366,2050,2052,2054],{"class":1368,"line":2051},22,[1366,2053,1949],{"class":1455},[1366,2055,1612],{"class":1440},[1366,2057,2059,2061,2063,2065,2068],{"class":1368,"line":2058},23,[1366,2060,1956],{"class":1455},[1366,2062,1462],{"class":1440},[1366,2064,1505],{"class":1468},[1366,2066,2067],{"class":1381},"High CPU on {{ $labels.instance 
}}",[1366,2069,1574],{"class":1468},[1366,2071,2073,2075,2077],{"class":1368,"line":2072},24,[1366,2074,1971],{"class":1455},[1366,2076,1462],{"class":1440},[1366,2078,1977],{"class":1976},[1366,2080,2082],{"class":1368,"line":2081},25,[1366,2083,2084],{"class":1381},"            CPU on {{ $labels.hostname }} has been above 85% for 5 minutes.\n",[1366,2086,2088],{"class":1368,"line":2087},26,[1366,2089,2090],{"class":1381},"            Current: {{ $value | printf \"%.1f\" }}%\n",[69,2092,1682,2093,2096,2097,2099,2100,188,2102,2104],{},[126,2094,2095],{},"for: 2m"," on ",[126,2098,1670],{}," absorbs brief network glitches. Without it, a momentary scrape failure sends an alert. The rich labels on the target — ",[126,2101,1498],{},[126,2103,1562],{}," — show up directly in the alert annotations.",[69,2106,2107],{},"One AlertManager config worth explaining is the inhibit rule:",[1330,2109,2111],{"className":1595,"code":2110,"language":1597,"meta":419,"style":419},"inhibit_rules:\n  - source_match:\n      alertname: \"HostDown\"\n    target_match_re:\n      alertname: \"HighCPUUsage|HighMemoryUsage|DiskSpaceLow\"\n    equal: [\"instance\"]\n",[126,2112,2113,2120,2129,2142,2149,2162],{"__ignoreMap":419},[1366,2114,2115,2118],{"class":1368,"line":12},[1366,2116,2117],{"class":1455},"inhibit_rules",[1366,2119,1612],{"class":1440},[1366,2121,2122,2124,2127],{"class":1368,"line":21},[1366,2123,1617],{"class":1440},[1366,2125,2126],{"class":1455}," source_match",[1366,2128,1612],{"class":1440},[1366,2130,2131,2134,2136,2138,2140],{"class":1368,"line":30},[1366,2132,2133],{"class":1455},"      alertname",[1366,2135,1462],{"class":1440},[1366,2137,1505],{"class":1468},[1366,2139,1670],{"class":1381},[1366,2141,1574],{"class":1468},[1366,2143,2144,2147],{"class":1368,"line":39},[1366,2145,2146],{"class":1455},"    
target_match_re",[1366,2148,1612],{"class":1440},[1366,2150,2151,2153,2155,2157,2160],{"class":1368,"line":48},[1366,2152,2133],{"class":1455},[1366,2154,1462],{"class":1440},[1366,2156,1505],{"class":1468},[1366,2158,2159],{"class":1381},"HighCPUUsage|HighMemoryUsage|DiskSpaceLow",[1366,2161,1574],{"class":1468},[1366,2163,2164,2167,2169,2171,2173,2175,2177],{"class":1368,"line":57},[1366,2165,2166],{"class":1455},"    equal",[1366,2168,1462],{"class":1440},[1366,2170,1465],{"class":1440},[1366,2172,1459],{"class":1468},[1366,2174,1842],{"class":1381},[1366,2176,1459],{"class":1468},[1366,2178,1592],{"class":1440},[69,2180,2181,2182,2184,2185,2188],{},"When ",[126,2183,1670],{}," fires for a host, AlertManager suppresses all other alerts for that same host. There's no useful signal in a ",[126,2186,2187],{},"HighMemoryUsage"," alert for a machine that isn't reachable. Without this, a single dead host can generate a cascade of noise.",[538,2190],{},[73,2192,1682,2194,2197],{"id":2193},"the-last_seen-pattern",[126,2195,2196],{},"last_seen"," Pattern",[69,2199,2200,2201,2204,2205,2207],{},"When a host disappears completely, Prometheus eventually stops having active series data for it. ",[126,2202,2203],{},"up{instance=\"...\"}"," doesn't return ",[126,2206,1835],{}," — it returns nothing, because there's no scrape happening. 
You lose the ability to answer \"when did this thing last check in?\"",[69,2209,2210],{},"A recording rule fixes this by continuously writing a timestamp whenever a host is up:",[1330,2212,2214],{"className":1595,"code":2213,"language":1597,"meta":419,"style":419},"groups:\n  - name: recording_rules\n    rules:\n      - record: node_last_seen_timestamp\n        expr: time() * up{job=\"node_exporter\"}\n",[126,2215,2216,2222,2233,2239,2251],{"__ignoreMap":419},[1366,2217,2218,2220],{"class":1368,"line":12},[1366,2219,1869],{"class":1455},[1366,2221,1612],{"class":1440},[1366,2223,2224,2226,2228,2230],{"class":1368,"line":21},[1366,2225,1617],{"class":1440},[1366,2227,1878],{"class":1455},[1366,2229,1462],{"class":1440},[1366,2231,2232],{"class":1381}," recording_rules\n",[1366,2234,2235,2237],{"class":1368,"line":30},[1366,2236,1888],{"class":1455},[1366,2238,1612],{"class":1440},[1366,2240,2241,2243,2246,2248],{"class":1368,"line":39},[1366,2242,1641],{"class":1440},[1366,2244,2245],{"class":1455}," record",[1366,2247,1462],{"class":1440},[1366,2249,2250],{"class":1381}," node_last_seen_timestamp\n",[1366,2252,2253,2255,2257],{"class":1368,"line":48},[1366,2254,1912],{"class":1455},[1366,2256,1462],{"class":1440},[1366,2258,2259],{"class":1381}," time() * up{job=\"node_exporter\"}\n",[69,2261,2262,2263,2266],{},"This writes the current Unix timestamp on every evaluation cycle, but only when ",[126,2264,2265],{},"up == 1",". When a host goes dark, the last written value persists in storage. 
In Grafana:",[1330,2268,2270],{"className":1715,"code":2269,"language":1717,"meta":419,"style":419},"time() - node_last_seen_timestamp\n",[126,2271,2272],{"__ignoreMap":419},[1366,2273,2274],{"class":1368,"line":12},[1366,2275,2269],{},[69,2277,2278],{},"Format as duration and you get: \"last seen 3h 22m ago.\" It's a small thing but it's become one of the most-used panels.",[538,2280],{},[73,2282,2284],{"id":2283},"where-this-left-off","Where This Left Off",[69,2286,2287],{},"By the end of the first week, the stack was functionally replacing NagiosXI for host monitoring. Prometheus scraping every host every 15 seconds, dashboards showing the fleet, AlertManager routing alerts with inhibit rules and deduplication, recording rules keeping last-seen timestamps for hosts that went dark.",[69,2289,2290],{},"But there was a question I hadn't resolved yet.",[69,2292,2293],{},"Node Exporter is a single-purpose binary — host metrics and nothing else. The moment we wanted logs or traces from these same hosts, we'd need additional agents running alongside it. And adding a host to monitoring still meant four manual steps: SSH in, install Node Exporter, write the target file, reload Prometheus.",[69,2295,2296],{},"My colleague had been working in parallel, exploring the multiple-exporter approach — a separate binary for each signal type. I'd been looking at Grafana Alloy, which promised a single agent that could handle all of it. 
We hadn't converged yet, and there were real questions about whether Alloy was ready enough to build on.",[69,2298,2299],{},"That's what Part 2 is about.",[454,2301],{"path":2302},"\u002Fblog\u002Flgtm-stack\u002Fpart-2",[2304,2305,2306],"style",{},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s8zF2, html code.shiki .s8zF2{--shiki-default:#A0ADA0}html pre.shiki code .sySUi, html code.shiki .sySUi{--shiki-default:#59873A}html pre.shiki code .spphp, html code.shiki .spphp{--shiki-default:#B56959}html pre.shiki code .si04Y, html code.shiki .si04Y{--shiki-default:#AB5959}html pre.shiki code .suHK_, html code.shiki .suHK_{--shiki-default:#393A34}html pre.shiki code .sEi1f, html code.shiki .sEi1f{--shiki-default:#A65E2B}html pre.shiki code .sYZai, html code.shiki .sYZai{--shiki-default:#999999}html pre.shiki code .s61at, html code.shiki .s61at{--shiki-default:#99841877}html pre.shiki code .su6XF, html code.shiki .su6XF{--shiki-default:#998418}html pre.shiki code .sSP4y, html code.shiki .sSP4y{--shiki-default:#B5695977}html pre.shiki code .sbBg2, html code.shiki .sbBg2{--shiki-default:#1E754F}",{"title":419,"searchDepth":21,"depth":21,"links":2308},[2309,2310,2311,2312,2314,2315,2316,2318],{"id":1241,"depth":21,"text":1242},{"id":1259,"depth":21,"text":1260},{"id":1349,"depth":21,"text":1350},{"id":1681,"depth":21,"text":2313},"The up Metric",{"id":1740,"depth":21,"text":1741},{"id":1852,"depth":21,"text":1853},{"id":2193,"depth":21,"text":2317},"The last_seen Pattern",{"id":2283,"depth":21,"text":2284},"Blog","2025-01-17","How we migrated 
from NagiosXI to a modern open-source observability stack — and why getting the foundation right mattered more than I expected.",{},"\u002Fblog\u002Flgtm-stack\u002Fpart-1","10 min",{"title":1222,"description":2321},"blog\u002Flgtm-stack\u002Fpart-1",[2328,1215,971,1161,2329],"Monitoring","DevOps","\u002Fimages\u002Fthumbnails\u002Flgtm-stack\u002Fpart-1.png","7NeM6IsFhBh6e-n5FcTCZrpdQwLcnU1u-1RFk4aMi1A",{"id":2333,"title":2334,"body":2335,"category":2319,"date":3002,"description":3003,"extension":434,"meta":3004,"navigation":436,"path":2302,"readTime":3005,"seo":3006,"stem":3007,"tags":3008,"thumbnail":3009,"__hash__":3010},"blog\u002Fblog\u002Flgtm-stack\u002Fpart-2.md","Building a Production Monitoring Stack from Scratch — Part 2: Grafana Alloy & the Push vs Pull Problem",{"type":66,"value":2336,"toc":2990},[2337,2346,2348,2350,2354,2357,2360,2363,2365,2369,2376,2379,2382,2399,2402,2474,2481,2483,2487,2493,2505,2513,2520,2526,2535,2538,2542,2547,2554,2559,2562,2571,2577,2581,2587,2594,2600,2606,2615,2702,2715,2845,2851,2853,2857,2860,2863,2882,2893,2896,2909,2912,2914,2918,2921,2924,2927,2930,2933,2935,2939,2944,2954,2963,2969,2971,2973,2976,2979,2984,2987],[1226,2338,2339],{},[69,2340,2341,1233,2343],{},[81,2342,1232],{},[81,2344,2345],{},"Part 2 of 4",[454,2347],{"path":2323},[538,2349],{},[73,2351,2353],{"id":2352},"the-open-question-from-part-1","The Open Question from Part 1",[69,2355,2356],{},"By the end of Phase 1, the stack was working. But Node Exporter is a single-purpose binary — host metrics, nothing else. The plan was always to get logs and traces into the same system, which meant we'd eventually need more agents on each host. A separate exporter for postgres metrics, another for nginx, maybe more after that. Each one is another thing to deploy, another thing to update, another thing to break in a subtly different way.",[69,2358,2359],{},"My colleague and I had been running in parallel on this. 
He was working through the multiple-exporter approach — the established path, a separate binary per signal type. I'd been looking at Grafana Alloy, which promised a single agent that could handle metrics, logs, and traces from one deployed process.",[69,2361,2362],{},"The question was whether Alloy was actually ready to build on.",[538,2364],{},[73,2366,2368],{"id":2367},"what-alloy-is","What Alloy Is",[69,2370,2371,2372,2375],{},"Grafana Alloy is Grafana Labs' open-source observability agent, positioned as the successor to Grafana Agent Flow. It's built around a pipeline model: you define sources, processors, and exporters as typed components and wire them together in ",[126,2373,2374],{},".alloy"," config files.",[69,2377,2378],{},"When I started working with it, it was still fairly new. The Agent Flow rebranding into Alloy had just stabilised, documentation was still filling in gaps, and community examples were sparse. You were going to hit sharp edges. But the direction seemed clearly right — one agent, multiple signals, explicit pipelines.",[69,2380,2381],{},"What made it compelling on paper:",[91,2383,2384,2390,2393,2396],{},[94,2385,2386,2389],{},[126,2387,2388],{},"prometheus.exporter.unix"," replicates Node Exporter's collectors without a separate binary",[94,2391,2392],{},"First-class support for OpenTelemetry receivers and exporters",[94,2394,2395],{},"Composable pipeline configs where data flow is visible and readable",[94,2397,2398],{},"Application metric endpoints (postgres, nginx, etc.) accessible as pipeline components",[69,2400,2401],{},"The config model is clean. 
Here's a simple pipeline — collect host metrics, send to Prometheus:",[1330,2403,2407],{"className":2404,"code":2405,"language":2406,"meta":419,"style":419},"language-alloy shiki shiki-themes vitesse-light","prometheus.exporter.unix \"localhost\" {\n  set_collectors = [\"cpu\", \"meminfo\", \"diskstats\", \"filesystem\", \"netdev\", \"loadavg\"]\n}\n\nprometheus.scrape \"node\" {\n  targets    = prometheus.exporter.unix.localhost.targets\n  forward_to = [prometheus.remote_write.default.receiver]\n}\n\nprometheus.remote_write \"default\" {\n  endpoint {\n    url = \"http:\u002F\u002Fprometheus:9090\u002Fapi\u002Fv1\u002Fwrite\"\n  }\n}\n","alloy",[126,2408,2409,2414,2419,2424,2428,2433,2438,2443,2447,2451,2456,2461,2466,2470],{"__ignoreMap":419},[1366,2410,2411],{"class":1368,"line":12},[1366,2412,2413],{},"prometheus.exporter.unix \"localhost\" {\n",[1366,2415,2416],{"class":1368,"line":21},[1366,2417,2418],{},"  set_collectors = [\"cpu\", \"meminfo\", \"diskstats\", \"filesystem\", \"netdev\", \"loadavg\"]\n",[1366,2420,2421],{"class":1368,"line":30},[1366,2422,2423],{},"}\n",[1366,2425,2426],{"class":1368,"line":39},[1366,2427,1895],{"emptyLinePlaceholder":436},[1366,2429,2430],{"class":1368,"line":48},[1366,2431,2432],{},"prometheus.scrape \"node\" {\n",[1366,2434,2435],{"class":1368,"line":57},[1366,2436,2437],{},"  targets    = prometheus.exporter.unix.localhost.targets\n",[1366,2439,2440],{"class":1368,"line":1536},[1366,2441,2442],{},"  forward_to = [prometheus.remote_write.default.receiver]\n",[1366,2444,2445],{"class":1368,"line":1557},[1366,2446,2423],{},[1366,2448,2449],{"class":1368,"line":1577},[1366,2450,1895],{"emptyLinePlaceholder":436},[1366,2452,2453],{"class":1368,"line":1583},[1366,2454,2455],{},"prometheus.remote_write \"default\" {\n",[1366,2457,2458],{"class":1368,"line":1589},[1366,2459,2460],{},"  endpoint {\n",[1366,2462,2463],{"class":1368,"line":1968},[1366,2464,2465],{},"    url = 
\"http:\u002F\u002Fprometheus:9090\u002Fapi\u002Fv1\u002Fwrite\"\n",[1366,2467,2468],{"class":1368,"line":1980},[1366,2469,1586],{},[1366,2471,2472],{"class":1368,"line":1986},[1366,2473,2423],{},[69,2475,2476,2477,2480],{},"That ",[126,2478,2479],{},"remote_write"," line introduced a problem I didn't see coming.",[538,2482],{},[73,2484,2486],{"id":2485},"the-push-vs-pull-problem","The Push vs Pull Problem",[69,2488,2489,2490,2492],{},"Prometheus is a pull-based system. It owns the scrape cycle — it reaches out to targets, pulls metrics, and as a side effect of each successful scrape, generates a synthetic ",[126,2491,1685],{}," metric:",[91,2494,2495,2500],{},[94,2496,2497,2499],{},[126,2498,1699],{}," — scrape succeeded, host is reachable",[94,2501,2502,2504],{},[126,2503,1705],{}," — scrape failed, something is wrong",[69,2506,2507,2509,2510,2512],{},[126,2508,1685],{}," isn't something your application exports. Prometheus generates it internally, based on whether the HTTP request to ",[126,2511,1356],{}," succeeded. The entire alerting chain from Part 1 depended on it.",[69,2514,2515,2516,2519],{},"When Alloy uses ",[126,2517,2518],{},"prometheus.remote_write",", the data flow reverses. Alloy pushes metrics to Prometheus via HTTP POST. Prometheus sits passively and receives what's sent.",[69,2521,2522,2523,2525],{},"And that means Prometheus never scrapes these hosts. So Prometheus never generates ",[126,2524,1685],{}," for them.",[69,2527,2528,2529,2531,2532,2534],{},"The first time I checked the Prometheus targets page after switching to push-based Alloy, those hosts weren't in the targets list at all. Not showing ",[126,2530,1705],{}," — not there at all. Prometheus had no scrape config for them; it was just receiving a stream of metrics it hadn't asked for. 
The ",[126,2533,1685],{}," metric had silently disappeared, and everything built on top of it — every alert, every \"host is down\" notification — had gone with it.",[69,2536,2537],{},"This was the kind of failure that wouldn't surface immediately. The dashboards still had data. Metrics were still flowing. It would only become obvious the next time a host actually went down and nobody got paged.",[86,2539,2541],{"id":2540},"the-workarounds-i-tried","The Workarounds I Tried",[69,2543,2544],{},[81,2545,2546],{},"Heartbeat metric from Alloy's internal health",[69,2548,2549,2550,2553],{},"Alloy exposes internal component status metrics. You can check if the pipeline is running. But this only tells you Alloy is alive on the server side — it says nothing about whether the ",[1673,2551,2552],{},"host"," is reachable. A host could be completely unreachable and Alloy's own health metrics would look fine from Prometheus's perspective, because Prometheus was never reaching out to check.",[69,2555,2556],{},[81,2557,2558],{},"Staleness detection via timestamp",[69,2560,2561],{},"If a host stops pushing, its metrics go stale. You can detect this:",[1330,2563,2565],{"className":1715,"code":2564,"language":1717,"meta":419,"style":419},"(time() - max by (instance) (timestamp(node_cpu_seconds_total))) > 120\n",[126,2566,2567],{"__ignoreMap":419},[1366,2568,2569],{"class":1368,"line":12},[1366,2570,2564],{},[69,2572,2573,2574,2576],{},"This technically works. But it's fragile — dependent on a specific metric being present and recently written, prone to false positives from remote_write buffer lag or brief network hiccups. And it means rewriting every alert and dashboard around staleness rather than the clean binary ",[126,2575,1685],{}," signal. 
It felt like building on sand.",[86,2578,2580],{"id":2579},"the-actual-fix","The Actual Fix",[69,2582,2583,2584],{},"After long enough on the workarounds, the right answer was simpler: ",[81,2585,2586],{},"keep Prometheus pulling, just pull from Alloy's HTTP endpoint instead of a standalone Node Exporter binary.",[69,2588,2589,2590,2593],{},"Alloy exposes an HTTP API on port ",[126,2591,2592],{},"12345",". Every component that produces metrics is accessible at a path under that API:",[1330,2595,2598],{"className":2596,"code":2597,"language":1335},[1333],"http:\u002F\u002F\u003Chost>:12345\u002Fapi\u002Fv0\u002Fcomponent\u002Fprometheus.exporter.unix.localhost\u002Fmetrics\n",[126,2599,2597],{"__ignoreMap":419},[69,2601,2602,2603,2605],{},"This is a plain HTTP endpoint serving Prometheus text format — exactly what Node Exporter served on port 9100. Prometheus can scrape it exactly like any other target. When it does, it generates ",[126,2604,1685],{},". Everything from Part 1 works without modification.",[69,2607,2608,2609,2612,2613,1462],{},"The Alloy config on each host becomes simpler, not more complex — no ",[126,2610,2611],{},"prometheus.scrape",", no ",[126,2614,2518],{},[1330,2616,2618],{"className":2404,"code":2617,"language":2406,"meta":419,"style":419},"prometheus.exporter.unix \"localhost\" {\n  set_collectors = [\n    \"cpu\",\n    \"meminfo\",\n    \"diskstats\",\n    \"filesystem\",\n    \"netdev\",\n    \"loadavg\",\n    \"uname\",\n    \"time\",\n    \"systemd\",\n    \"processes\",\n  ]\n}\n\n\u002F\u002F Prometheus will pull from:\n\u002F\u002F http:\u002F\u002F\u003Chost>:12345\u002Fapi\u002Fv0\u002Fcomponent\u002Fprometheus.exporter.unix.localhost\u002Fmetrics\n",[126,2619,2620,2624,2629,2634,2639,2644,2649,2654,2659,2664,2669,2674,2679,2684,2688,2692,2697],{"__ignoreMap":419},[1366,2621,2622],{"class":1368,"line":12},[1366,2623,2413],{},[1366,2625,2626],{"class":1368,"line":21},[1366,2627,2628],{},"  set_collectors = 
[\n",[1366,2630,2631],{"class":1368,"line":30},[1366,2632,2633],{},"    \"cpu\",\n",[1366,2635,2636],{"class":1368,"line":39},[1366,2637,2638],{},"    \"meminfo\",\n",[1366,2640,2641],{"class":1368,"line":48},[1366,2642,2643],{},"    \"diskstats\",\n",[1366,2645,2646],{"class":1368,"line":57},[1366,2647,2648],{},"    \"filesystem\",\n",[1366,2650,2651],{"class":1368,"line":1536},[1366,2652,2653],{},"    \"netdev\",\n",[1366,2655,2656],{"class":1368,"line":1557},[1366,2657,2658],{},"    \"loadavg\",\n",[1366,2660,2661],{"class":1368,"line":1577},[1366,2662,2663],{},"    \"uname\",\n",[1366,2665,2666],{"class":1368,"line":1583},[1366,2667,2668],{},"    \"time\",\n",[1366,2670,2671],{"class":1368,"line":1589},[1366,2672,2673],{},"    \"systemd\",\n",[1366,2675,2676],{"class":1368,"line":1968},[1366,2677,2678],{},"    \"processes\",\n",[1366,2680,2681],{"class":1368,"line":1980},[1366,2682,2683],{},"  ]\n",[1366,2685,2686],{"class":1368,"line":1986},[1366,2687,2423],{},[1366,2689,2690],{"class":1368,"line":1992},[1366,2691,1895],{"emptyLinePlaceholder":436},[1366,2693,2694],{"class":1368,"line":1997},[1366,2695,2696],{},"\u002F\u002F Prometheus will pull from:\n",[1366,2698,2699],{"class":1368,"line":2009},[1366,2700,2701],{},"\u002F\u002F http:\u002F\u002F\u003Chost>:12345\u002Fapi\u002Fv0\u002Fcomponent\u002Fprometheus.exporter.unix.localhost\u002Fmetrics\n",[69,2703,2704,2705,2707,2708,2711,2712,2714],{},"The target file for each Alloy host points to port ",[126,2706,2592],{}," and uses ",[126,2709,2710],{},"__metrics_path__"," — a special Prometheus label that overrides the default ",[126,2713,1356],{}," scrape path — to point at the correct component endpoint:",[1330,2716,2718],{"className":1431,"code":2717,"language":1433,"meta":419,"style":419},"[\n  {\n    \"targets\": [\"10.200.3.23:12345\"],\n    \"labels\": {\n      \"hostname\": \"cloud-network-3\",\n      \"environment\": \"production\",\n      \"maintainers\": \"admin@domain.com\",\n      
\"__metrics_path__\": \"\u002Fapi\u002Fv0\u002Fcomponent\u002Fprometheus.exporter.unix.localhost\u002Fmetrics\"\n    }\n  }\n]\n",[126,2719,2720,2724,2728,2749,2761,2780,2798,2816,2833,2837,2841],{"__ignoreMap":419},[1366,2721,2722],{"class":1368,"line":12},[1366,2723,1441],{"class":1440},[1366,2725,2726],{"class":1368,"line":21},[1366,2727,1446],{"class":1440},[1366,2729,2730,2732,2734,2736,2738,2740,2742,2745,2747],{"class":1368,"line":30},[1366,2731,1452],{"class":1451},[1366,2733,1456],{"class":1455},[1366,2735,1459],{"class":1451},[1366,2737,1462],{"class":1440},[1366,2739,1465],{"class":1440},[1366,2741,1459],{"class":1468},[1366,2743,2744],{"class":1381},"10.200.3.23:12345",[1366,2746,1459],{"class":1468},[1366,2748,1476],{"class":1440},[1366,2750,2751,2753,2755,2757,2759],{"class":1368,"line":39},[1366,2752,1452],{"class":1451},[1366,2754,1483],{"class":1455},[1366,2756,1459],{"class":1451},[1366,2758,1462],{"class":1440},[1366,2760,1490],{"class":1440},[1366,2762,2763,2765,2767,2769,2771,2773,2776,2778],{"class":1368,"line":48},[1366,2764,1495],{"class":1451},[1366,2766,1498],{"class":1455},[1366,2768,1459],{"class":1451},[1366,2770,1462],{"class":1440},[1366,2772,1505],{"class":1468},[1366,2774,2775],{"class":1381},"cloud-network-3",[1366,2777,1459],{"class":1468},[1366,2779,1513],{"class":1440},[1366,2781,2782,2784,2786,2788,2790,2792,2794,2796],{"class":1368,"line":57},[1366,2783,1495],{"class":1451},[1366,2785,1520],{"class":1455},[1366,2787,1459],{"class":1451},[1366,2789,1462],{"class":1440},[1366,2791,1505],{"class":1468},[1366,2793,1529],{"class":1381},[1366,2795,1459],{"class":1468},[1366,2797,1513],{"class":1440},[1366,2799,2800,2802,2804,2806,2808,2810,2812,2814],{"class":1368,"line":1536},[1366,2801,1495],{"class":1451},[1366,2803,1562],{"class":1455},[1366,2805,1459],{"class":1451},[1366,2807,1462],{"class":1440},[1366,2809,1505],{"class":1468},[1366,2811,1571],{"class":1381},[1366,2813,1459],{"class":1468},[1366,2815,1513],{"class":1440},[1366
,2817,2818,2820,2822,2824,2826,2828,2831],{"class":1368,"line":1557},[1366,2819,1495],{"class":1451},[1366,2821,2710],{"class":1455},[1366,2823,1459],{"class":1451},[1366,2825,1462],{"class":1440},[1366,2827,1505],{"class":1468},[1366,2829,2830],{"class":1381},"\u002Fapi\u002Fv0\u002Fcomponent\u002Fprometheus.exporter.unix.localhost\u002Fmetrics",[1366,2832,1574],{"class":1468},[1366,2834,2835],{"class":1368,"line":1577},[1366,2836,1580],{"class":1440},[1366,2838,2839],{"class":1368,"line":1583},[1366,2840,1586],{"class":1440},[1366,2842,2843],{"class":1368,"line":1589},[1366,2844,1592],{"class":1440},[69,2846,2847,2848,2850],{},"Prometheus scrapes it, gets back standard metrics, generates ",[126,2849,1699],{},", stores everything as normal. The alerting chain is intact.",[538,2852],{},[73,2854,2856],{"id":2855},"getting-application-metrics-in-the-part-that-nearly-broke-alloy-for-us","Getting Application Metrics In: The Part That Nearly Broke Alloy for Us",[69,2858,2859],{},"The unix exporter was the straightforward part. The harder part was getting application-level metrics — specifically postgres and nginx — through Alloy rather than through separate exporter binaries.",[69,2861,2862],{},"Alloy has built-in components for both. The postgres one:",[1330,2864,2866],{"className":2404,"code":2865,"language":2406,"meta":419,"style":419},"prometheus.exporter.postgres \"db\" {\n  data_source_names = [\"postgresql:\u002F\u002Fuser:pass@localhost:5432\u002Fmydb?sslmode=disable\"]\n}\n",[126,2867,2868,2873,2878],{"__ignoreMap":419},[1366,2869,2870],{"class":1368,"line":12},[1366,2871,2872],{},"prometheus.exporter.postgres \"db\" {\n",[1366,2874,2875],{"class":1368,"line":21},[1366,2876,2877],{},"  data_source_names = [\"postgresql:\u002F\u002Fuser:pass@localhost:5432\u002Fmydb?sslmode=disable\"]\n",[1366,2879,2880],{"class":1368,"line":30},[1366,2881,2423],{},[69,2883,2884,2885,2888,2889,2892],{},"I tried the nginx equivalent first. 
Alloy has ",[126,2886,2887],{},"prometheus.exporter.nginx",", which connects to nginx's ",[126,2890,2891],{},"stub_status"," endpoint and pulls metrics from it. I set it up, checked the output — nothing. No metrics, no errors, just silence. I spent time on it, checked the nginx config, checked the Alloy config, tried different approaches. At some point I started thinking seriously about just installing the standalone nginx exporter and giving up on Alloy for application metrics entirely.",[69,2894,2895],{},"Before doing that, I tried the postgres component instead. It worked immediately — metrics flowing through on the first attempt. That was the signal I needed. If postgres worked, nginx should work too. The problem wasn't Alloy. Something was wrong with my specific nginx setup.",[69,2897,2898,2899,2901,2902,2904,2905,2908],{},"I went back to nginx, looked more carefully at the ",[126,2900,2891],{}," configuration, and found it. The endpoint wasn't properly enabled — the nginx config had the ",[126,2903,2891],{}," block but it was only accessible from ",[126,2906,2907],{},"127.0.0.1",", and Alloy was trying to reach it in a way that wasn't matching that restriction. A small fix, and nginx metrics started flowing.",[69,2910,2911],{},"The near-ditch was worth it. Running separate exporters for every application would have meant my colleague's approach and my approach converging on the same place — a proliferation of binaries per host. The whole point of Alloy was avoiding that.",[538,2913],{},[73,2915,2917],{"id":2916},"why-alloy-over-multiple-exporters","Why Alloy Over Multiple Exporters",[69,2919,2920],{},"My colleague's multiple-exporter approach was working. It's the established path, well-documented, stable. The case for it isn't wrong.",[69,2922,2923],{},"But the case for Alloy is better for where we're going. The moment you want logs — which was always the plan — you need another agent anyway. 
If you're already running Node Exporter, postgres exporter, and nginx exporter, you're at three binaries per host. Adding a log agent makes four. Each one needs to be deployed, configured, updated, and monitored independently.",[69,2925,2926],{},"With Alloy, adding logs is another component in the same config file on the same process. Adding traces is the same. The operational footprint stays at one agent per host regardless of how many signals you're collecting.",[69,2928,2929],{},"There's also the matter of the pipeline model. When you have a single config that describes exactly what data is flowing where, debugging is straightforward. With four separate agents running independently, understanding the full picture requires checking four separate processes.",[69,2931,2932],{},"The sharp edges were real — the push vs pull problem cost me real time, the nginx issue nearly derailed the whole approach. But those were solvable problems. The structural limitation of multiple exporters — complexity that compounds as you add signals — isn't.",[538,2934],{},[73,2936,2938],{"id":2937},"what-changed-in-grafana","What Changed in Grafana",[69,2940,2941,2943],{},[126,2942,2388],{}," uses the same metric names as standalone Node Exporter — it's built on the same underlying collectors. Every PromQL query from Part 1 works unchanged.",[69,2945,2946,2947,2949,2950,2953],{},"The one real change is how ",[126,2948,1685],{}," is queried. With Alloy, the job label becomes ",[126,2951,2952],{},"\"alloy\"",", and filtering is done through the richer label set on each target — environment, priority, instance — rather than anything tied to a port or exporter binary. 
For example, the fleet status panel:",[1330,2955,2957],{"className":1715,"code":2956,"language":1717,"meta":419,"style":419},"count(up{job=\"alloy\", priority=~\"${priority}\", environment=~\"${environment}\", instance!~\"^localhost.*\", instance=~\"${instance}\"} == 1) or vector(0)\n",[126,2958,2959],{"__ignoreMap":419},[1366,2960,2961],{"class":1368,"line":12},[1366,2962,2956],{},[69,2964,1682,2965,2968],{},[126,2966,2967],{},"or vector(0)"," ensures the panel returns zero rather than no data when nothing matches — a small thing that matters when you're staring at a dashboard at 2am wondering if the query is broken or the hosts are genuinely all down.",[538,2970],{},[73,2972,2284],{"id":2283},[69,2974,2975],{},"By the end of Phase 2, the stack had a single agent per host handling metrics across system and application layers, pull-based scraping preserved so all the alerting machinery from Phase 1 still worked, and OTel ports open on the Alloy container for what came next.",[69,2977,2978],{},"The next gap was observability beyond metrics. Host CPU and memory tell you a machine is struggling; they don't tell you why, or what a request was doing when it failed. 
That meant Loki for logs and Tempo for distributed traces.",[69,2980,2981],{},[81,2982,2983],{},"Next:",[454,2985],{"path":2986},"\u002Fblog\u002Flgtm-stack\u002Fpart-3",[2304,2988,2989],{},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sYZai, html code.shiki .sYZai{--shiki-default:#999999}html pre.shiki code .s61at, html code.shiki .s61at{--shiki-default:#99841877}html pre.shiki code .su6XF, html code.shiki .su6XF{--shiki-default:#998418}html pre.shiki code .sSP4y, html code.shiki .sSP4y{--shiki-default:#B5695977}html pre.shiki code .spphp, html code.shiki .spphp{--shiki-default:#B56959}",{"title":419,"searchDepth":21,"depth":21,"links":2991},[2992,2993,2994,2998,2999,3000,3001],{"id":2352,"depth":21,"text":2353},{"id":2367,"depth":21,"text":2368},{"id":2485,"depth":21,"text":2486,"children":2995},[2996,2997],{"id":2540,"depth":30,"text":2541},{"id":2579,"depth":30,"text":2580},{"id":2855,"depth":21,"text":2856},{"id":2916,"depth":21,"text":2917},{"id":2937,"depth":21,"text":2938},{"id":2283,"depth":21,"text":2284},"2026-05-18","Why we replaced individual exporters with Grafana Alloy, why push-based metrics silently broke our alerting, and what it took to figure that out.",{},"11 
min",{"title":2334,"description":3003},"blog\u002Flgtm-stack\u002Fpart-2",[2328,1215,1216,971,2329],"\u002Fimages\u002Fthumbnails\u002Flgtm-stack\u002Fpart-2.png","5rTJAJ6R9qZvwa3_G-JIs2egK0cngWF_IBU66dyKnZk",{"id":3012,"title":3013,"body":3014,"category":2319,"date":3621,"description":3622,"extension":434,"meta":3623,"navigation":436,"path":2986,"readTime":2324,"seo":3624,"stem":3625,"tags":3626,"thumbnail":3627,"__hash__":3628},"blog\u002Fblog\u002Flgtm-stack\u002Fpart-3.md","Building a Production Monitoring Stack from Scratch — Part 3: Loki, Tempo & the Full Observability Picture",{"type":66,"value":3015,"toc":3611},[3016,3025,3027,3029,3031,3035,3038,3041,3043,3047,3050,3056,3062,3071,3077,3079,3083,3086,3099,3219,3222,3224,3228,3231,3238,3249,3252,3258,3261,3272,3274,3277,3280,3352,3355,3358,3451,3453,3457,3460,3487,3490,3493,3541,3550,3553,3555,3559,3562,3579,3582,3588,3591,3593,3595,3598,3601,3605,3608],[1226,3017,3018],{},[69,3019,3020,1233,3022],{},[81,3021,1232],{},[81,3023,3024],{},"Part 3 of 4",[454,3026],{"path":2323},[454,3028],{"path":2302},[538,3030],{},[73,3032,3034],{"id":3033},"what-was-still-missing","What Was Still Missing",[69,3036,3037],{},"After Parts 1 and 2, host health was solid. Prometheus pulling from Alloy on every node, dashboards showing the fleet, AlertManager firing when something went wrong. But all of that is infrastructure-level visibility — CPU spiking, disk filling, host going dark. It tells you a machine is struggling. It doesn't tell you what was happening inside your applications when it did.",[69,3039,3040],{},"For that you need logs and traces. 
This part covers adding both — Loki for log aggregation and Tempo for distributed tracing — and how the central Alloy instance on mon-node-a ties all three signal types together.",[538,3042],{},[73,3044,3046],{"id":3045},"two-alloy-roles","Two Alloy Roles",[69,3048,3049],{},"Before getting into Loki and Tempo, it's worth being clear about something that can cause confusion: there are two distinct Alloy deployments in this setup and they do completely different things.",[69,3051,3052,3055],{},[81,3053,3054],{},"Alloy on each enrolled node"," — runs the unix exporter, exposes host metrics on port 12345, gets scraped by Prometheus. This is the pull-based setup from Part 2. Nothing changes here.",[69,3057,3058,3061],{},[81,3059,3060],{},"Central Alloy on mon-node-a"," — runs as a container alongside Prometheus, Loki, and Tempo. Opens OTel endpoints on ports 4317 (gRPC) and 4318 (HTTP). Any instrumented application sends its telemetry here, and Alloy routes each signal type to the right backend.",[69,3063,3064,3065,3067,3068,3070],{},"The separation is clean: node Alloy handles infrastructure signals via pull, central Alloy handles application signals via OTel push. Prometheus only scrapes the node Alloys. 
The ",[126,3066,1685],{}," metric concern from Part 2 doesn't apply here — we're not relying on ",[126,3069,1685],{}," for application health, only for host availability.",[1330,3072,3075],{"className":3073,"code":3074,"language":1335},[1333],"[ Enrolled Nodes ]\n      |\n  Alloy :12345  (one per node, host metrics)\n      |\n      ↓  PULL\n[ Prometheus ]  →  AlertManager\n      ↑\n      |  remote_write (application metrics only)\n      |\n[ Central Alloy :4317\u002F:4318 ]  ←  instrumented applications (OTel)\n      |\n      ├──→ Tempo      (traces)\n      └──→ Loki       (logs)\n\n[ Grafana ]  ←── queries Prometheus, Loki, Tempo\n",[126,3076,3074],{"__ignoreMap":419},[538,3078],{},[73,3080,3082],{"id":3081},"storage-minio","Storage: MinIO",[69,3084,3085],{},"Both Loki and Tempo need a durable storage backend. In a cloud environment that would be S3. Here, MinIO provides an S3-compatible store running as a container on mon-node-a.",[69,3087,3088,3089,188,3092,356,3095,3098],{},"Three buckets: ",[126,3090,3091],{},"loki-data",[126,3093,3094],{},"loki-ruler",[126,3096,3097],{},"tempo",". 
The entrypoint script pre-creates the directories before MinIO starts — a small thing that saves a confusing startup failure on first run.",[1330,3100,3102],{"className":1595,"code":3101,"language":1597,"meta":419,"style":419},"minio:\n  image: minio\u002Fminio:latest\n  environment:\n    - MINIO_ACCESS_KEY=observability\n    - MINIO_SECRET_KEY=supersecret\n  entrypoint:\n    - sh\n    - -euc\n    - |\n      mkdir -p \u002Fdata\u002Ftempo\n      mkdir -p \u002Fdata\u002Floki-data\n      mkdir -p \u002Fdata\u002Floki-ruler\n      minio server \u002Fdata --console-address ':9001'\n  networks:\n    - monitoring\n  volumes:\n    - .\u002Fdata\u002Fminio:\u002Fdata\n",[126,3103,3104,3111,3121,3128,3136,3143,3150,3157,3164,3171,3176,3181,3186,3191,3198,3205,3212],{"__ignoreMap":419},[1366,3105,3106,3109],{"class":1368,"line":12},[1366,3107,3108],{"class":1455},"minio",[1366,3110,1612],{"class":1440},[1366,3112,3113,3116,3118],{"class":1368,"line":21},[1366,3114,3115],{"class":1455},"  image",[1366,3117,1462],{"class":1440},[1366,3119,3120],{"class":1381}," minio\u002Fminio:latest\n",[1366,3122,3123,3126],{"class":1368,"line":30},[1366,3124,3125],{"class":1455},"  environment",[1366,3127,1612],{"class":1440},[1366,3129,3130,3133],{"class":1368,"line":39},[1366,3131,3132],{"class":1440},"    -",[1366,3134,3135],{"class":1381}," MINIO_ACCESS_KEY=observability\n",[1366,3137,3138,3140],{"class":1368,"line":48},[1366,3139,3132],{"class":1440},[1366,3141,3142],{"class":1381}," MINIO_SECRET_KEY=supersecret\n",[1366,3144,3145,3148],{"class":1368,"line":57},[1366,3146,3147],{"class":1455},"  entrypoint",[1366,3149,1612],{"class":1440},[1366,3151,3152,3154],{"class":1368,"line":1536},[1366,3153,3132],{"class":1440},[1366,3155,3156],{"class":1381}," sh\n",[1366,3158,3159,3161],{"class":1368,"line":1557},[1366,3160,3132],{"class":1440},[1366,3162,3163],{"class":1381}," 
-euc\n",[1366,3165,3166,3168],{"class":1368,"line":1577},[1366,3167,3132],{"class":1440},[1366,3169,3170],{"class":1976}," |\n",[1366,3172,3173],{"class":1368,"line":1583},[1366,3174,3175],{"class":1381},"      mkdir -p \u002Fdata\u002Ftempo\n",[1366,3177,3178],{"class":1368,"line":1589},[1366,3179,3180],{"class":1381},"      mkdir -p \u002Fdata\u002Floki-data\n",[1366,3182,3183],{"class":1368,"line":1968},[1366,3184,3185],{"class":1381},"      mkdir -p \u002Fdata\u002Floki-ruler\n",[1366,3187,3188],{"class":1368,"line":1980},[1366,3189,3190],{"class":1381},"      minio server \u002Fdata --console-address ':9001'\n",[1366,3192,3193,3196],{"class":1368,"line":1986},[1366,3194,3195],{"class":1455},"  networks",[1366,3197,1612],{"class":1440},[1366,3199,3200,3202],{"class":1368,"line":1992},[1366,3201,3132],{"class":1440},[1366,3203,3204],{"class":1381}," monitoring\n",[1366,3206,3207,3210],{"class":1368,"line":1997},[1366,3208,3209],{"class":1455},"  volumes",[1366,3211,1612],{"class":1440},[1366,3213,3214,3216],{"class":1368,"line":2009},[1366,3215,3132],{"class":1440},[1366,3217,3218],{"class":1381}," .\u002Fdata\u002Fminio:\u002Fdata\n",[69,3220,3221],{},"The MinIO web console on port 9001 is useful when first bringing things up — you can watch objects appearing in the buckets and confirm that Loki and Tempo are actually flushing data rather than buffering it indefinitely.",[538,3223],{},[73,3225,3227],{"id":3226},"loki","Loki",[69,3229,3230],{},"Loki runs in microservices mode with read, write, and backend roles as separate containers, each with three replicas. 
The read and write paths scale independently, which matters as log volume grows.",[69,3232,3233,3234,3237],{},"A ",[126,3235,3236],{},"loki-init"," container runs first to set correct directory ownership — Loki processes run as UID 10001 and the volume mount needs to reflect that before anything starts.",[69,3239,3240,3241,3244,3245,3248],{},"All external traffic goes through an nginx gateway in front of the cluster. Central Alloy pushes logs to ",[126,3242,3243],{},"http:\u002F\u002Floki-gateway:3100\u002Floki\u002Fapi\u002Fv1\u002Fpush",". Grafana queries ",[126,3246,3247],{},"http:\u002F\u002Floki-gateway:3100",". Neither needs to know which replica handles a given request.",[69,3250,3251],{},"A few config decisions worth noting:",[69,3253,3254,3257],{},[126,3255,3256],{},"s3forcepathstyle: true"," is required when talking to MinIO — it uses path-style URLs rather than the virtual-hosted style AWS uses, and without this flag nothing stores correctly.",[69,3259,3260],{},"Replication factor 3 means each chunk is written to all three write replicas. Since they all back onto the same MinIO instance this is about write redundancy rather than independent storage — but it means the cluster survives a replica restart without data loss in the WAL.",[69,3262,3263,3264,3267,3268,3271],{},"The three component types discover each other via memberlist gossip on port 7946, joining by container name. Getting the ",[126,3265,3266],{},"join_members"," list right — ",[126,3269,3270],{},"[\"loki-read\", \"loki-write\", \"loki-backend\"]"," — is what brings the cluster together.",[538,3273],{},[73,3275,3276],{"id":3097},"Tempo",[69,3278,3279],{},"Tempo also runs in microservices mode. 
The components and what each does:",[873,3281,3282,3290],{},[876,3283,3284],{},[879,3285,3286,3288],{},[882,3287,1272],{},[882,3289,1275],{},[889,3291,3292,3302,3312,3322,3332,3342],{},[879,3293,3294,3299],{},[894,3295,3296],{},[126,3297,3298],{},"tempo-distributor",[894,3300,3301],{},"Receives traces from Alloy, routes to ingesters",[879,3303,3304,3309],{},[894,3305,3306],{},[126,3307,3308],{},"tempo-ingester-0\u002F1\u002F2",[894,3310,3311],{},"Buffers traces in memory, flushes to MinIO",[879,3313,3314,3319],{},[894,3315,3316],{},[126,3317,3318],{},"tempo-query-frontend",[894,3320,3321],{},"Entry point for Grafana queries",[879,3323,3324,3329],{},[894,3325,3326],{},[126,3327,3328],{},"tempo-querier",[894,3330,3331],{},"Executes queries against ingesters and object storage",[879,3333,3334,3339],{},[894,3335,3336],{},[126,3337,3338],{},"tempo-compactor",[894,3340,3341],{},"Merges and compacts trace blocks",[879,3343,3344,3349],{},[894,3345,3346],{},[126,3347,3348],{},"tempo-metrics-generator",[894,3350,3351],{},"Derives RED metrics from trace data, writes to Prometheus",[69,3353,3354],{},"The metrics generator is worth understanding. It reads incoming traces and derives standard RED metrics — Rate, Errors, Duration — then writes them back to Prometheus via remote_write. The practical effect is that you get service-level dashboards showing request rates, error rates, and latency percentiles automatically from trace data, without any additional metric instrumentation in your applications. 
The traces are the source of truth; Tempo does the calculation.",[69,3356,3357],{},"It also builds a service dependency graph from trace data that Grafana can render as an interactive topology map — which services call which, with live latency and error rates on each edge.",[1330,3359,3361],{"className":1595,"code":3360,"language":1597,"meta":419,"style":419},"metrics_generator:\n  storage:\n    remote_write:\n      - url: http:\u002F\u002Fprometheus:9090\u002Fapi\u002Fv1\u002Fwrite\n        send_exemplars: true\n  processor:\n    service_graphs:\n      wait: 10s\n      max_items: 10000\n      workers: 10\n",[126,3362,3363,3370,3377,3384,3396,3406,3413,3420,3430,3441],{"__ignoreMap":419},[1366,3364,3365,3368],{"class":1368,"line":12},[1366,3366,3367],{"class":1455},"metrics_generator",[1366,3369,1612],{"class":1440},[1366,3371,3372,3375],{"class":1368,"line":21},[1366,3373,3374],{"class":1455},"  storage",[1366,3376,1612],{"class":1440},[1366,3378,3379,3382],{"class":1368,"line":30},[1366,3380,3381],{"class":1455},"    remote_write",[1366,3383,1612],{"class":1440},[1366,3385,3386,3388,3391,3393],{"class":1368,"line":39},[1366,3387,1641],{"class":1440},[1366,3389,3390],{"class":1455}," url",[1366,3392,1462],{"class":1440},[1366,3394,3395],{"class":1381}," http:\u002F\u002Fprometheus:9090\u002Fapi\u002Fv1\u002Fwrite\n",[1366,3397,3398,3401,3403],{"class":1368,"line":48},[1366,3399,3400],{"class":1455},"        send_exemplars",[1366,3402,1462],{"class":1440},[1366,3404,3405],{"class":1976}," true\n",[1366,3407,3408,3411],{"class":1368,"line":57},[1366,3409,3410],{"class":1455},"  processor",[1366,3412,1612],{"class":1440},[1366,3414,3415,3418],{"class":1368,"line":1536},[1366,3416,3417],{"class":1455},"    service_graphs",[1366,3419,1612],{"class":1440},[1366,3421,3422,3425,3427],{"class":1368,"line":1557},[1366,3423,3424],{"class":1455},"      wait",[1366,3426,1462],{"class":1440},[1366,3428,3429],{"class":1381}," 
10s\n",[1366,3431,3432,3435,3437],{"class":1368,"line":1577},[1366,3433,3434],{"class":1455},"      max_items",[1366,3436,1462],{"class":1440},[1366,3438,3440],{"class":3439},"s-TwI"," 10000\n",[1366,3442,3443,3446,3448],{"class":1368,"line":1583},[1366,3444,3445],{"class":1455},"      workers",[1366,3447,1462],{"class":1440},[1366,3449,3450],{"class":3439}," 10\n",[538,3452],{},[73,3454,3456],{"id":3455},"getting-application-signals-in","Getting Application Signals In",[69,3458,3459],{},"From an application's perspective, the integration is just two environment variables:",[1330,3461,3463],{"className":1360,"code":3462,"language":1362,"meta":419,"style":419},"OTEL_EXPORTER_OTLP_ENDPOINT=http:\u002F\u002Fmon-node-a:4317\nOTEL_SERVICE_NAME=my-service\n",[126,3464,3465,3477],{"__ignoreMap":419},[1366,3466,3467,3471,3474],{"class":1368,"line":12},[1366,3468,3470],{"class":3469},"svycV","OTEL_EXPORTER_OTLP_ENDPOINT",[1366,3472,3473],{"class":1440},"=",[1366,3475,3476],{"class":1381},"http:\u002F\u002Fmon-node-a:4317\n",[1366,3478,3479,3482,3484],{"class":1368,"line":21},[1366,3480,3481],{"class":3469},"OTEL_SERVICE_NAME",[1366,3483,3473],{"class":1440},[1366,3485,3486],{"class":1381},"my-service\n",[69,3488,3489],{},"The OTel SDK handles the rest. 
Traces, logs, and metrics all go to the same endpoint and Alloy sorts them.",[69,3491,3492],{},"The central Alloy config receives all three signal types through one receiver and routes each to its backend:",[1330,3494,3496],{"className":2404,"code":3495,"language":2406,"meta":419,"style":419},"otelcol.receiver.otlp \"otlp_receiver\" {\n  grpc { endpoint = \"0.0.0.0:4317\" }\n  http { endpoint = \"0.0.0.0:4318\" }\n  output {\n    traces  = [otelcol.processor.batch.default.input]\n    logs    = [otelcol.processor.batch.default.input]\n    metrics = [otelcol.processor.batch.default.input]\n  }\n}\n",[126,3497,3498,3503,3508,3513,3518,3523,3528,3533,3537],{"__ignoreMap":419},[1366,3499,3500],{"class":1368,"line":12},[1366,3501,3502],{},"otelcol.receiver.otlp \"otlp_receiver\" {\n",[1366,3504,3505],{"class":1368,"line":21},[1366,3506,3507],{},"  grpc { endpoint = \"0.0.0.0:4317\" }\n",[1366,3509,3510],{"class":1368,"line":30},[1366,3511,3512],{},"  http { endpoint = \"0.0.0.0:4318\" }\n",[1366,3514,3515],{"class":1368,"line":39},[1366,3516,3517],{},"  output {\n",[1366,3519,3520],{"class":1368,"line":48},[1366,3521,3522],{},"    traces  = [otelcol.processor.batch.default.input]\n",[1366,3524,3525],{"class":1368,"line":57},[1366,3526,3527],{},"    logs    = [otelcol.processor.batch.default.input]\n",[1366,3529,3530],{"class":1368,"line":1536},[1366,3531,3532],{},"    metrics = [otelcol.processor.batch.default.input]\n",[1366,3534,3535],{"class":1368,"line":1557},[1366,3536,1586],{},[1366,3538,3539],{"class":1368,"line":1577},[1366,3540,2423],{},[69,3542,3543,3544,3547,3548,129],{},"After batching, signals split to their respective exporters: traces to the Tempo distributor via OTLP, logs to Loki via ",[126,3545,3546],{},"loki.write",", application metrics to Prometheus via ",[126,3549,2479],{},[69,3551,3552],{},"The OTel to Alloy to Tempo path worked on the first proper attempt — the pipeline model makes the data flow explicit enough that when something isn't arriving 
where you expect it, it's usually obvious which component in the chain is the problem.",[538,3554],{},[73,3556,3558],{"id":3557},"connecting-everything-in-grafana","Connecting Everything in Grafana",[69,3560,3561],{},"Three data sources on mon-node-b:",[91,3563,3564,3569,3574],{},[94,3565,3566,3568],{},[81,3567,971],{}," — host metrics, application metrics, and the RED metrics Tempo generates",[94,3570,3571,3573],{},[81,3572,3227],{}," — application logs",[94,3575,3576,3578],{},[81,3577,3276],{}," — distributed traces",[69,3580,3581],{},"The part that makes these three genuinely useful together rather than just three separate views is derived fields in Loki. Any log line containing a trace ID becomes a clickable link to that trace in Tempo:",[1330,3583,3586],{"className":3584,"code":3585,"language":1335},[1333],"Field name: traceId\nRegex: traceId=(\\w+)\nInternal link: Tempo → ${__value.raw}\n",[126,3587,3585],{"__ignoreMap":419},[69,3589,3590],{},"From a trace in Tempo you can navigate back to the Loki logs for that service in the same time window. The three signals become navigable together rather than three separate places to look.",[538,3592],{},[73,3594,2284],{"id":2283},[69,3596,3597],{},"The stack now covers all three observability pillars. Host health and availability through Prometheus and Alloy on each node, unchanged from Part 2. Application logs through Loki. Distributed traces through Tempo, with RED metrics derived automatically. All queryable from Grafana with the signals linked to each other.",[69,3599,3600],{},"What was still manual: enrolling a new host still meant SSHing in, installing Alloy, writing its config, creating a target file, and reloading Prometheus. 
That friction was the last remaining operational problem — and fixing it turned into something bigger than just a script.",[69,3602,3603],{},[81,3604,2983],{},[454,3606],{"path":3607},"\u002Fblog\u002Flgtm-stack\u002Fpart-4",[2304,3609,3610],{},"html pre.shiki code .su6XF, html code.shiki .su6XF{--shiki-default:#998418}html pre.shiki code .sYZai, html code.shiki .sYZai{--shiki-default:#999999}html pre.shiki code .spphp, html code.shiki .spphp{--shiki-default:#B56959}html pre.shiki code .sbBg2, html code.shiki .sbBg2{--shiki-default:#1E754F}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .s-TwI, html code.shiki .s-TwI{--shiki-default:#2F798A}html pre.shiki code .svycV, html code.shiki .svycV{--shiki-default:#B07D48}",{"title":419,"searchDepth":21,"depth":21,"links":3612},[3613,3614,3615,3616,3617,3618,3619,3620],{"id":3033,"depth":21,"text":3034},{"id":3045,"depth":21,"text":3046},{"id":3081,"depth":21,"text":3082},{"id":3226,"depth":21,"text":3227},{"id":3097,"depth":21,"text":3276},{"id":3455,"depth":21,"text":3456},{"id":3557,"depth":21,"text":3558},{"id":2283,"depth":21,"text":2284},"2026-05-19","Adding log aggregation with Loki and distributed tracing with Tempo — completing the metrics, logs, and traces 
picture.",{},{"title":3013,"description":3622},"blog\u002Flgtm-stack\u002Fpart-3",[2328,3227,3276,1215,1216,979,2329],"\u002Fimages\u002Fthumbnails\u002Flgtm-stack\u002Fpart-3.png","9s48r2qM_PDpsOglDZPADwXDaUQsdhpZExXgq7Db3Q0",{"id":3630,"title":3631,"body":3632,"category":2319,"date":3872,"description":3873,"extension":434,"meta":3874,"navigation":436,"path":3607,"readTime":2324,"seo":3875,"stem":3876,"tags":3877,"thumbnail":3880,"__hash__":3881},"blog\u002Fblog\u002Flgtm-stack\u002Fpart-4.md","Building a Production Monitoring Stack from Scratch — Part 4: The Enrollment API",{"type":66,"value":3633,"toc":3863},[3634,3643,3645,3647,3649,3651,3655,3658,3684,3687,3690,3692,3696,3699,3702,3705,3708,3710,3714,3717,3720,3723,3725,3729,3732,3761,3767,3770,3777,3780,3782,3786,3793,3796,3799,3801,3805,3808,3818,3821,3824,3826,3830,3833,3839,3848,3854,3860],[1226,3635,3636],{},[69,3637,3638,1233,3640],{},[81,3639,1232],{},[81,3641,3642],{},"Part 4 of 4",[454,3644],{"path":2323},[454,3646],{"path":2302},[454,3648],{"path":2986},[538,3650],{},[73,3652,3654],{"id":3653},"the-remaining-friction","The Remaining Friction",[69,3656,3657],{},"Three parts in, the monitoring stack was complete in terms of capability. But adding a new host still looked like this:",[256,3659,3660,3663,3666,3669,3672,3675,3678,3681],{},[94,3661,3662],{},"SSH onto the host",[94,3664,3665],{},"Install Grafana Alloy",[94,3667,3668],{},"Write the Alloy config for that host's specific services",[94,3670,3671],{},"Start the service",[94,3673,3674],{},"Write a Prometheus target JSON file with the host's full metadata",[94,3676,3677],{},"Place that file in the targets directory on mon-node-a",[94,3679,3680],{},"Reload Prometheus",[94,3682,3683],{},"Verify the host appeared in Grafana",[69,3685,3686],{},"At ten hosts this is manageable. We were heading toward significantly more than ten, across different subnets, different OS versions, different service combinations. 
Each enrollment was a context switch — SSH session, config writing, file placement — and each one was another opportunity for a typo in a label that would surface as a confusing gap in a dashboard weeks later.",[69,3688,3689],{},"The obvious answer was automation. The question was what form it should take.",[538,3691],{},[73,3693,3695],{"id":3694},"why-not-ansible","Why Not Ansible",[69,3697,3698],{},"Ansible was the straightforward choice. Write a playbook, run it against a host, done. It's what most teams would reach for.",[69,3700,3701],{},"The problem was the operational model it would create. A playbook lives in a repository. Adding a host means committing a vars file, or updating an inventory, pushing to a repo, waiting for a pipeline. Secrets need to be encrypted, which means setting up Vault or ansible-vault, wiring that into CI, making sure anyone who needs to enroll a host has access to the right keys. You've traded one kind of manual work for another.",[69,3703,3704],{},"It also meant the monitoring stack would have an external dependency — a separate repository, a CI system — just to add a host to Prometheus. If the pipeline was down, enrollment was blocked.",[69,3706,3707],{},"What I wanted was something self-contained that lived with the monitoring stack and could be used by anyone with access to Grafana, without needing to touch a repository or understand the underlying infrastructure.",[538,3709],{},[73,3711,3713],{"id":3712},"the-grafana-plugin-insight","The Grafana Plugin Insight",[69,3715,3716],{},"Grafana has a plugin system. You can build a custom frontend that installs directly into Grafana as a panel or app, with its own pages, its own navigation, its own UI. It appears in the sidebar alongside Dashboards and Alerting as if it were a native part of the product.",[69,3718,3719],{},"That was the piece that made the whole thing click. 
If I built an enrollment API and a Grafana plugin that called it, the entire workflow would live inside the tool the team was already using. No separate app to navigate to, no CLI to remember. Just a form in Grafana: fill in the host details, submit, host is monitored.",[69,3721,3722],{},"I'd built APIs in Python before and had experience writing Python to SSH into hosts and execute commands — that background came from working on a CD pipeline from scratch. The backend wasn't the unknown part. The Grafana plugin was new territory.",[538,3724],{},[73,3726,3728],{"id":3727},"the-api","The API",[69,3730,3731],{},"The backend is a Python API. An enrollment request comes in with the host's IP, SSH credentials, the services running on it, and the label metadata to attach to it in Prometheus. The API then:",[256,3733,3734,3737,3740,3743,3746,3752,3755,3758],{},[94,3735,3736],{},"Connects to the host via SSH (key or password)",[94,3738,3739],{},"Detects the OS — Debian, RHEL, SUSE, Windows each have different package managers and service managers",[94,3741,3742],{},"Installs Grafana Alloy if not present, or validates the existing installation",[94,3744,3745],{},"Generates an Alloy config appropriate for that host and its services from templates",[94,3747,3748,3749,3751],{},"Deploys the config and validates it by running ",[126,3750,648],{}," on the remote host before restarting the service",[94,3753,3754],{},"Writes the Prometheus target file with the full label set",[94,3756,3757],{},"Reloads Prometheus via its HTTP API",[94,3759,3760],{},"Returns a structured response",[69,3762,3763,3764,3766],{},"The validation step — running ",[126,3765,648],{}," on the host itself before restarting Alloy — was an early decision that proved its worth. Config template bugs would otherwise surface as a silent Alloy failure: service appears to restart, metrics stop appearing, nothing in the logs that makes the cause obvious. 
Catching the syntax error before committing to it saved that confusion more than once.",[69,3768,3769],{},"Enrollment is idempotent. Running it against an existing host checks what's already there, updates what's changed, and skips what hasn't. Re-enrolling a host after a template update is a normal operation, not a risky one.",[69,3771,3772,3773,3776],{},"For hosts being decommissioned, the API renames the target file with a ",[126,3774,3775],{},".deleted.\u003Ctimestamp>"," suffix rather than deleting it. The targets directory ends up as a passive audit trail — you can see every host that was ever enrolled and when it was removed, without digging through logs.",[69,3778,3779],{},"The API also handles batch enrollment — a list of hosts processed concurrently up to a configurable limit, with per-host status tracking so failed hosts can be retried independently.",[538,3781],{},[73,3783,3785],{"id":3784},"the-grafana-plugin","The Grafana Plugin",[69,3787,3788,3789,3792],{},"The plugin is a Grafana app plugin — it installs into Grafana and adds pages to the sidebar. The enrollment form lives at ",[126,3790,3791],{},"\u002Fgrafana\u002Fa\u002Firis\u002Fenroll",". It has fields for connection details, host type, SSH credentials, service configuration, and the label metadata that ends up in Prometheus.",[69,3794,3795],{},"Building a Grafana plugin for the first time meant learning the plugin SDK, understanding how Grafana's frontend architecture works, and figuring out how to wire API calls through Grafana's proxy so the backend isn't exposed directly. None of that was especially difficult, but it was all new, and the documentation for app plugins is thinner than for panel plugins.",[69,3797,3798],{},"The result is that enrollment happens entirely within Grafana. An operator fills in the form, hits submit, and within about 30 seconds the host appears in the fleet dashboard. The underlying SSH, config generation, and Prometheus reload are invisible. 
The plugin also has pages for viewing enrolled hosts, managing labels, and handling batch enrollments from a file upload.",[538,3800],{},[73,3802,3804],{"id":3803},"what-changed","What Changed",[69,3806,3807],{},"Before the API, enrolling a host was a sequence of manual steps spread across multiple systems. After it, the same operation takes 30 seconds and happens inside the tool the team already has open.",[69,3809,3810,3811,3814,3815,3817],{},"The label consistency improved noticeably. When metadata is entered through a form with defined fields rather than hand-written into a JSON file, the alert annotations and dashboard filters stay clean. No more ",[126,3812,3813],{},"maintainer"," vs ",[126,3816,1562],{}," label mismatches surfacing weeks later.",[69,3819,3820],{},"The audit trail in the targets directory — active files, deleted files with timestamps — became useful almost immediately. During a network audit, being able to answer \"when was this host enrolled and when was it removed\" from the directory listing alone, without touching logs or databases, turned out to be genuinely handy.",[69,3822,3823],{},"The API also became the foundation for more. Once you have a reliable programmatic path into the monitoring stack, other things become possible — automated enrollment from infrastructure provisioning, health checks, label updates when service ownership changes. That expansion became its own project.",[538,3825],{},[73,3827,3829],{"id":3828},"closing-the-series","Closing the Series",[69,3831,3832],{},"The series started with the question of whether an open-source observability stack could genuinely replace NagiosXI in production. By Part 4, the answer was clearly yes — and the result had gone further than parity.",[69,3834,3835,3838],{},[81,3836,3837],{},"Part 1"," — Prometheus, Grafana, Node Exporter, AlertManager. 
Functional host monitoring, replacing what NagiosXI did.",[69,3840,3841,3844,3845,3847],{},[81,3842,3843],{},"Part 2"," — Grafana Alloy replacing Node Exporter. One agent per host, the push vs pull problem, recovering the ",[126,3846,1685],{}," metric.",[69,3849,3850,3853],{},[81,3851,3852],{},"Part 3"," — Loki and Tempo. Logs and distributed traces alongside metrics, all queryable from Grafana with signals linked to each other.",[69,3855,3856,3859],{},[81,3857,3858],{},"Part 4"," — The enrollment API and Grafana plugin. The operational friction of adding hosts, eliminated.",[69,3861,3862],{},"The stack covers the full production fleet with metrics, logs, and traces. Alerts are accurate. Enrollment takes under a minute. The whole thing runs on open-source software with no licensing costs.",{"title":419,"searchDepth":21,"depth":21,"links":3864},[3865,3866,3867,3868,3869,3870,3871],{"id":3653,"depth":21,"text":3654},{"id":3694,"depth":21,"text":3695},{"id":3712,"depth":21,"text":3713},{"id":3727,"depth":21,"text":3728},{"id":3784,"depth":21,"text":3785},{"id":3803,"depth":21,"text":3804},{"id":3828,"depth":21,"text":3829},"2026-05-20","Eliminating manual SSH work — and why the solution ended up living inside Grafana.",{},{"title":3631,"description":3873},"blog\u002Flgtm-stack\u002Fpart-4",[2328,971,1216,2329,3878,3879,1215],"Automation","API","\u002Fimages\u002Fthumbnails\u002Flgtm-stack\u002Fpart-4.png","5IGjILaaxONrcL_a3lge9njzhMlPwFM5F8Xz1yvMaI8",{"id":3883,"title":3884,"body":3885,"category":2319,"date":5340,"description":5341,"extension":434,"meta":5342,"navigation":436,"path":5343,"readTime":5344,"seo":5345,"stem":5346,"tags":5347,"thumbnail":5351,"__hash__":5352},"blog\u002Fblog\u002Fmtu-troubleshooting\u002Findex.md","When Packets Disappear: Debugging an MTU Mismatch in a Hybrid OpenStack Docker 
Swarm",{"type":66,"value":3886,"toc":5321},[3887,3892,3894,3898,3901,3904,3907,3913,3916,3922,3929,3932,3934,3938,3941,3944,3947,3949,3953,3956,3959,3964,3981,3984,3991,3994,4008,4037,4040,4042,4046,4049,4052,4054,4058,4061,4098,4105,4133,4138,4145,4156,4158,4162,4165,4212,4219,4225,4227,4231,4234,4291,4301,4304,4310,4313,4316,4318,4322,4329,4335,4342,4348,4350,4354,4357,4409,4412,4416,4419,4425,4430,4433,4443,4478,4484,4486,4490,4493,4508,4511,4514,4531,4534,4673,4676,4678,4682,4685,4802,4808,4810,4814,4817,4824,5149,5159,5166,5168,5172,5179,5205,5208,5210,5214,5220,5226,5232,5238,5244,5246,5250,5315,5318],[69,3888,3889],{},[1673,3890,3891],{},"A real-world troubleshooting story about silent packet drops, floating IPs, and why the obvious fix is not always the right one.",[538,3893],{},[73,3895,3897],{"id":3896},"background-why-we-have-a-hybrid-setup","Background: Why We Have a Hybrid Setup",[69,3899,3900],{},"Our infrastructure runs on a hybrid model — some nodes live inside an OpenStack private cloud, and some live on bare metal servers outside of it. This was not an accident or an oversight. It was a deliberate architectural decision born out of caution.",[69,3902,3903],{},"OpenStack is powerful, but like any cloud platform, it can have outages, networking hiccups, or capacity issues — especially when it is still being evaluated in production for the first time. We wanted a safety net. 
If the OpenStack environment had a bad day, our core orchestration layer would still be standing.",[69,3905,3906],{},"So our Docker Swarm cluster looks like this:",[1330,3908,3911],{"className":3909,"code":3910,"language":1335},[1333],"Outside OpenStack (bare metal):\n  - 2 Swarm Manager nodes\n  - Keepalived (for VIP failover)\n  - HAProxy (load balancer)\n  - Traefik (reverse proxy and service router)\n  - 1 Data node (persistent storage workloads)\n  - 1 Monitoring node (observability stack)\n\nInside OpenStack:\n  - 1 Production node (email API and other prod services)\n  - 1 Test node (test workloads)\n  - ASG worker nodes (auto-scaled up and down based on CPU and memory)\n",[126,3912,3910],{"__ignoreMap":419},[69,3914,3915],{},"Traffic flows like this for any external request:",[1330,3917,3920],{"className":3918,"code":3919,"language":1335},[1333],"Internet → VIP → HAProxy → Traefik → Service container\n",[126,3921,3919],{"__ignoreMap":419},[69,3923,3924,3925,3928],{},"Services talk to each other using their Traefik URLs, for example ",[126,3926,3927],{},"https:\u002F\u002Femail-service\u002Fapi\u002Fsend",". Traefik handles the routing based on labels attached to each Docker service.",[69,3930,3931],{},"This setup has been working well. Until one day, emails stopped sending.",[538,3933],{},[73,3935,3937],{"id":3936},"the-problem-emails-timing-out-silently","The Problem: Emails Timing Out Silently",[69,3939,3940],{},"A service running on the test node was making POST requests to the email service. The email service is hosted on the production node inside OpenStack. The requests were timing out on the calling side, and the email service was not receiving anything at all — no logs, no errors, nothing. 
It was as if the requests were vanishing into thin air.",[69,3942,3943],{},"Meanwhile, the monitoring node — which also sends emails (a daily digest with AI-generated summaries and HTML content) — was working perfectly fine.",[69,3945,3946],{},"That asymmetry was the first interesting clue.",[538,3948],{},[73,3950,3952],{"id":3951},"first-hypothesis-docker-overlay-mtu-problem","First Hypothesis: Docker Overlay MTU Problem",[69,3954,3955],{},"The initial instinct was a classic Docker Swarm networking issue. When Docker creates overlay networks (the virtual networks that let containers on different physical nodes talk to each other), it assumes the underlying network can carry standard Ethernet frames of 1500 bytes.",[69,3957,3958],{},"But OpenStack's virtual network adds its own wrapper around every packet. Technologies like VXLAN or Geneve are used to tunnel traffic between virtual machines, and that tunnelling eats into the available space:",[1226,3960,3961],{},[69,3962,3963],{},"Think of it like putting a letter inside an envelope, and then putting that envelope inside a bigger envelope to mail it. The outer envelope takes up space, so the inner letter has to be smaller.",[91,3965,3966,3972,3975],{},[94,3967,3968,3969],{},"Standard Ethernet MTU: ",[81,3970,3971],{},"1500 bytes",[94,3973,3974],{},"VXLAN overhead: ~50 bytes",[94,3976,3977,3978],{},"Effective MTU on OpenStack: ",[81,3979,3980],{},"~1450 bytes",[69,3982,3983],{},"If Docker thinks it can send 1500-byte packets but the network can only carry 1450, oversized packets get silently dropped. No error. No ICMP \"too big\" message. 
Just gone.",[69,3985,3986,3987,3990],{},"This is called an ",[81,3988,3989],{},"MTU mismatch",", and it is a well-known pain point in containerised environments running on top of virtualised networks.",[69,3992,3993],{},"The standard fix for this is:",[256,3995,3996,4002,4005],{},[94,3997,3998,3999],{},"Tell Docker to use a smaller MTU in ",[126,4000,4001],{},"\u002Fetc\u002Fdocker\u002Fdaemon.json",[94,4003,4004],{},"Recreate the overlay networks with the correct MTU",[94,4006,4007],{},"Recreate the ingress network",[1330,4009,4011],{"className":1431,"code":4010,"language":1433,"meta":419,"style":419},"{\n  \"mtu\": 1400\n}\n",[126,4012,4013,4018,4033],{"__ignoreMap":419},[1366,4014,4015],{"class":1368,"line":12},[1366,4016,4017],{"class":1440},"{\n",[1366,4019,4020,4023,4026,4028,4030],{"class":1368,"line":21},[1366,4021,4022],{"class":1451},"  \"",[1366,4024,4025],{"class":1455},"mtu",[1366,4027,1459],{"class":1451},[1366,4029,1462],{"class":1440},[1366,4031,4032],{"class":3439}," 1400\n",[1366,4034,4035],{"class":1368,"line":30},[1366,4036,2423],{"class":1440},[69,4038,4039],{},"But this approach had a serious problem for our environment.",[538,4041],{},[73,4043,4045],{"id":4044},"why-the-standard-fix-was-not-viable","Why the Standard Fix Was Not Viable",[69,4047,4048],{},"We have a lot of services deployed across many nodes. Recreating the Docker ingress network requires all nodes to temporarily lose port routing. Recreating overlay networks means services get restarted. 
With dozens of services and several nodes, this would mean significant downtime.",[69,4050,4051],{},"We needed to think more carefully before touching anything.",[538,4053],{},[73,4055,4057],{"id":4056},"digging-deeper-a-ping-test-reveals-the-truth","Digging Deeper: A Ping Test Reveals the Truth",[69,4059,4060],{},"Before making any changes, we ran a diagnostic test on the test node — the one where emails were failing:",[1330,4062,4064],{"className":1360,"code":4063,"language":1362,"meta":419,"style":419},"docker run --rm alpine ping -c 5 -s 1200 8.8.8.8\n",[126,4065,4066],{"__ignoreMap":419},[1366,4067,4068,4071,4074,4077,4080,4083,4086,4089,4092,4095],{"class":1368,"line":12},[1366,4069,4070],{"class":1377},"docker",[1366,4072,4073],{"class":1381}," run",[1366,4075,4076],{"class":1407}," --rm",[1366,4078,4079],{"class":1381}," alpine",[1366,4081,4082],{"class":1381}," ping",[1366,4084,4085],{"class":1407}," -c",[1366,4087,4088],{"class":3439}," 5",[1366,4090,4091],{"class":1407}," -s",[1366,4093,4094],{"class":3439}," 1200",[1366,4096,4097],{"class":3439}," 8.8.8.8\n",[69,4099,4100,4101,4104],{},"Result: ",[81,4102,4103],{},"5\u002F5 packets received."," Fine.",[1330,4106,4108],{"className":1360,"code":4107,"language":1362,"meta":419,"style":419},"docker run --rm alpine ping -c 5 -s 1472 8.8.8.8\n",[126,4109,4110],{"__ignoreMap":419},[1366,4111,4112,4114,4116,4118,4120,4122,4124,4126,4128,4131],{"class":1368,"line":12},[1366,4113,4070],{"class":1377},[1366,4115,4073],{"class":1381},[1366,4117,4076],{"class":1407},[1366,4119,4079],{"class":1381},[1366,4121,4082],{"class":1381},[1366,4123,4085],{"class":1407},[1366,4125,4088],{"class":3439},[1366,4127,4091],{"class":1407},[1366,4129,4130],{"class":3439}," 1472",[1366,4132,4097],{"class":3439},[69,4134,4100,4135],{},[81,4136,4137],{},"0\u002F5 packets received. 100% loss.",[69,4139,4140,4141,4144],{},"This is significant. The ",[126,4142,4143],{},"-s"," flag sets the packet payload size. 
Adding 28 bytes for the ICMP and IP headers, a payload of 1472 bytes makes a total packet size of exactly 1500 bytes — the standard Ethernet MTU.",[69,4146,4147,4148,4151,4152,4155],{},"So anything at or near a full-size Ethernet frame was being completely dropped when leaving the OpenStack node. This confirmed there was an MTU problem, but the question was: ",[1673,4149,4150],{},"where exactly"," was it happening, and ",[1673,4153,4154],{},"why"," was it only affecting the test node and not the monitoring node?",[538,4157],{},[73,4159,4161],{"id":4160},"the-key-asymmetry-inside-vs-outside-openstack","The Key Asymmetry: Inside vs Outside OpenStack",[69,4163,4164],{},"Let us look at what was different between the working and failing cases:",[873,4166,4167,4180],{},[876,4168,4169],{},[879,4170,4171,4174,4177],{},[882,4172,4173],{},"Source",[882,4175,4176],{},"Destination",[882,4178,4179],{},"Result",[889,4181,4182,4193,4202],{},[879,4183,4184,4187,4190],{},[894,4185,4186],{},"Monitoring node (outside OpenStack)",[894,4188,4189],{},"Email service (inside OpenStack)",[894,4191,4192],{},"✅ Works",[879,4194,4195,4198,4200],{},[894,4196,4197],{},"Old test node (outside OpenStack)",[894,4199,4189],{},[894,4201,4192],{},[879,4203,4204,4207,4209],{},[894,4205,4206],{},"New test node (inside OpenStack)",[894,4208,4189],{},[894,4210,4211],{},"❌ Fails",[69,4213,4214,4215,4218],{},"The common variable is not the payload size. The monitoring node sends large HTML emails and they go through fine. The common variable is ",[81,4216,4217],{},"where the request originates",". 
Everything originating from inside OpenStack to the email service was failing.",[69,4220,4221,4222,129],{},"This pointed away from a Docker overlay problem and toward a ",[81,4223,4224],{},"network path problem specific to OpenStack",[538,4226],{},[73,4228,4230],{"id":4229},"the-real-traffic-path-a-surprising-discovery","The Real Traffic Path: A Surprising Discovery",[69,4232,4233],{},"Here is where the architecture revealed something unexpected. The OpenStack nodes join the Swarm like this:",[1330,4235,4237],{"className":1360,"code":4236,"language":1362,"meta":419,"style":419},"docker swarm join --token \"$TOKEN\" \"$MANAGER\" \\\n  --advertise-addr \"$FLOATING_IP\" \\\n  --listen-addr 0.0.0.0:2377\n",[126,4238,4239,4269,4283],{"__ignoreMap":419},[1366,4240,4241,4243,4246,4249,4252,4254,4257,4259,4261,4264,4266],{"class":1368,"line":12},[1366,4242,4070],{"class":1377},[1366,4244,4245],{"class":1381}," swarm",[1366,4247,4248],{"class":1381}," join",[1366,4250,4251],{"class":1407}," --token",[1366,4253,1505],{"class":1468},[1366,4255,4256],{"class":1381},"$TOKEN",[1366,4258,1459],{"class":1468},[1366,4260,1505],{"class":1468},[1366,4262,4263],{"class":1381},"$MANAGER",[1366,4265,1459],{"class":1468},[1366,4267,4268],{"class":1407}," \\\n",[1366,4270,4271,4274,4276,4279,4281],{"class":1368,"line":21},[1366,4272,4273],{"class":1407},"  --advertise-addr",[1366,4275,1505],{"class":1468},[1366,4277,4278],{"class":1381},"$FLOATING_IP",[1366,4280,1459],{"class":1468},[1366,4282,4268],{"class":1407},[1366,4284,4285,4288],{"class":1368,"line":30},[1366,4286,4287],{"class":1407},"  --listen-addr",[1366,4289,4290],{"class":1381}," 0.0.0.0:2377\n",[69,4292,1682,4293,4296,4297,4300],{},[126,4294,4295],{},"--advertise-addr"," is set to the node's ",[81,4298,4299],{},"floating IP"," — its external, publicly routable IP address. 
This was necessary because the Swarm managers live outside OpenStack, and the only way for an OpenStack node to reach them is via the external network.",[69,4302,4303],{},"But this has a side effect. Every other node in the Swarm — including other OpenStack nodes on the same internal subnet — now thinks the only way to reach that node is via its floating IP. So when the test node talks to the email service, even though they are on the same internal OpenStack network, the traffic takes this path:",[1330,4305,4308],{"className":4306,"code":4307,"language":1335},[1333],"Test node (inside OpenStack)\n  → exits via floating IP through OpenStack router (NAT)\n    → hits external network\n      → HAProxy\n        → Traefik\n          → re-enters OpenStack via prod node floating IP (NAT)\n            → Email service container\n",[126,4309,4307],{"__ignoreMap":419},[69,4311,4312],{},"Two nodes sitting on the same internal subnet are taking a round trip through the external network to talk to each other. And each time a packet crosses the OpenStack network boundary through NAT, it picks up more overhead.",[69,4314,4315],{},"The monitoring node works because it is already outside OpenStack. Its traffic only crosses the boundary once — going in. 
No double NAT, less encapsulation pressure on each packet.",[538,4317],{},[73,4319,4321],{"id":4320},"confirming-with-interface-inspection","Confirming With Interface Inspection",[69,4323,4324,4325,4328],{},"Running ",[126,4326,4327],{},"ip link show"," on the test node made the mismatch immediately visible:",[1330,4330,4333],{"className":4331,"code":4332,"language":1335},[1333],"ens3:            MTU 1450   ← OpenStack network interface\ndocker0:         MTU 1500   ← Docker bridge, unaware\ndocker_gwbridge: MTU 1500   ← Docker gateway bridge, also unaware\nveth*:           MTU 1500   ← All container interfaces, also unaware\n",[126,4334,4332],{"__ignoreMap":419},[69,4336,4337,4338,4341],{},"The host network interface ",[126,4339,4340],{},"ens3"," is correctly at 1450 — OpenStack set it that way to account for VXLAN overhead. But every Docker interface on the same node is at 1500. Docker was never told about the OpenStack constraint.",[69,4343,4344,4345,4347],{},"So when a container builds a packet, it thinks it has 1500 bytes to work with. That packet travels through the veth interface, through the Docker gateway bridge, and then hits ",[126,4346,4340],{}," — which can only carry 1450 bytes. 
The oversized packet hits the wall and is silently dropped.",[538,4349],{},[73,4351,4353],{"id":4352},"the-iptables-fix-that-worked","The Iptables Fix That Worked",[69,4355,4356],{},"Before understanding all of this fully, an iptables rule was applied on the test node:",[1330,4358,4360],{"className":1360,"code":4359,"language":1362,"meta":419,"style":419},"sudo iptables -t mangle -I FORWARD -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --set-mss 1360\n",[126,4361,4362],{"__ignoreMap":419},[1366,4363,4364,4367,4370,4373,4376,4379,4382,4385,4388,4391,4394,4397,4400,4403,4406],{"class":1368,"line":12},[1366,4365,4366],{"class":1377},"sudo",[1366,4368,4369],{"class":1381}," iptables",[1366,4371,4372],{"class":1407}," -t",[1366,4374,4375],{"class":1381}," mangle",[1366,4377,4378],{"class":1407}," -I",[1366,4380,4381],{"class":1381}," FORWARD",[1366,4383,4384],{"class":1407}," -p",[1366,4386,4387],{"class":1381}," tcp",[1366,4389,4390],{"class":1407}," --tcp-flags",[1366,4392,4393],{"class":1381}," SYN,RST",[1366,4395,4396],{"class":1381}," SYN",[1366,4398,4399],{"class":1407}," -j",[1366,4401,4402],{"class":1381}," TCPMSS",[1366,4404,4405],{"class":1407}," --set-mss",[1366,4407,4408],{"class":3439}," 1360\n",[69,4410,4411],{},"And the emails started going through immediately.",[86,4413,4415],{"id":4414},"what-does-this-actually-do","What Does This Actually Do?",[69,4417,4418],{},"To understand this fix, you need to know a little about how TCP connections work.",[69,4420,4421,4422,129],{},"When two machines want to talk over TCP (the protocol used for HTTP, HTTPS, and most internet traffic), they start with a handshake. During this handshake, both sides announce the largest chunk of data they are willing to receive at once. This is called the ",[81,4423,4424],{},"Maximum Segment Size (MSS)",[1226,4426,4427],{},[69,4428,4429],{},"Think of it like two people agreeing on how many items to pass at once down a conveyor belt. 
If you agree on small batches, nothing gets dropped even if the belt has a narrow section somewhere in the middle.",[69,4431,4432],{},"The iptables rule intercepts the very first packet of every TCP connection (the SYN packet), and rewrites the MSS value to something smaller. Both sides then negotiate based on that smaller value, and the entire connection uses smaller chunks from the start. The oversized packet problem never occurs because the data is broken into pieces that fit.",[69,4434,1682,4435,4438,4439,4442],{},[126,4436,4437],{},"--set-mss 1360"," hardcodes the MSS to 1360 bytes. It works, but a smarter version uses ",[126,4440,4441],{},"--clamp-mss-to-pmtu"," instead:",[1330,4444,4446],{"className":1360,"code":4445,"language":1362,"meta":419,"style":419},"iptables -t mangle -I FORWARD -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --clamp-mss-to-pmtu\n",[126,4447,4448],{"__ignoreMap":419},[1366,4449,4450,4453,4455,4457,4459,4461,4463,4465,4467,4469,4471,4473,4475],{"class":1368,"line":12},[1366,4451,4452],{"class":1377},"iptables",[1366,4454,4372],{"class":1407},[1366,4456,4375],{"class":1381},[1366,4458,4378],{"class":1407},[1366,4460,4381],{"class":1381},[1366,4462,4384],{"class":1407},[1366,4464,4387],{"class":1381},[1366,4466,4390],{"class":1407},[1366,4468,4393],{"class":1381},[1366,4470,4396],{"class":1381},[1366,4472,4399],{"class":1407},[1366,4474,4402],{"class":1381},[1366,4476,4477],{"class":1407}," --clamp-mss-to-pmtu\n",[69,4479,4480,4481,4483],{},"This tells the kernel to calculate the correct MSS automatically based on the actual outgoing interface MTU (1450 on ",[126,4482,4340],{},"), rather than using a hardcoded value. If the network MTU ever changes, the rule adapts automatically.",[538,4485],{},[73,4487,4489],{"id":4488},"why-this-fix-and-not-the-docker-mtu-fix","Why This Fix and Not the Docker MTU Fix?",[69,4491,4492],{},"This is the important question. 
We could have:",[256,4494,4495,4502,4505],{},[94,4496,4497,4498,4501],{},"Changed ",[126,4499,4500],{},"daemon.json"," to set Docker MTU to 1400",[94,4503,4504],{},"Recreated all overlay networks",[94,4506,4507],{},"Recreated the ingress network",[69,4509,4510],{},"But that approach would have caused significant downtime across all services for a problem that only affects outbound TCP from OpenStack nodes. It is the right fix if your Docker overlay traffic between nodes is dropping. It is overkill — and risky — when the actual problem is a specific outbound path.",[69,4512,4513],{},"The iptables TCPMSS approach:",[91,4515,4516,4519,4522,4525,4528],{},[94,4517,4518],{},"Touches nothing else in the stack",[94,4520,4521],{},"Requires no service restarts",[94,4523,4524],{},"Requires no network recreation",[94,4526,4527],{},"Only affects outbound TCP SYN packets from that node",[94,4529,4530],{},"Is invisible to services and containers",[69,4532,4533],{},"We confirmed this by checking the iptables rule counters after applying it:",[1330,4535,4537],{"className":1360,"code":4536,"language":1362,"meta":419,"style":419},"sudo iptables -t mangle -L FORWARD -n -v --line-numbers\n\nChain FORWARD (policy ACCEPT 5894K packets, 2970M bytes)\nnum   pkts bytes target     prot opt in     out     source               destination\n1        6   360 TCPMSS     6    --  *      *       0.0.0.0\u002F0  0.0.0.0\u002F0  tcp flags:0x06\u002F0x02 TCPMSS clamp to PMTU\n",[126,4538,4539,4563,4567,4595,4626],{"__ignoreMap":419},[1366,4540,4541,4543,4545,4547,4549,4552,4554,4557,4560],{"class":1368,"line":12},[1366,4542,4366],{"class":1377},[1366,4544,4369],{"class":1381},[1366,4546,4372],{"class":1407},[1366,4548,4375],{"class":1381},[1366,4550,4551],{"class":1407}," -L",[1366,4553,4381],{"class":1381},[1366,4555,4556],{"class":1407}," -n",[1366,4558,4559],{"class":1407}," -v",[1366,4561,4562],{"class":1407}," 
--line-numbers\n",[1366,4564,4565],{"class":1368,"line":21},[1366,4566,1895],{"emptyLinePlaceholder":436},[1366,4568,4569,4572,4574,4577,4580,4583,4586,4589,4592],{"class":1368,"line":30},[1366,4570,4571],{"class":1377},"Chain",[1366,4573,4381],{"class":1381},[1366,4575,4576],{"class":1392}," (policy ",[1366,4578,4579],{"class":1381},"ACCEPT",[1366,4581,4582],{"class":1381}," 5894K",[1366,4584,4585],{"class":1381}," packets,",[1366,4587,4588],{"class":1381}," 2970M",[1366,4590,4591],{"class":1381}," bytes",[1366,4593,4594],{"class":1392},")\n",[1366,4596,4597,4600,4603,4605,4608,4611,4614,4617,4620,4623],{"class":1368,"line":39},[1366,4598,4599],{"class":1377},"num",[1366,4601,4602],{"class":1381},"   pkts",[1366,4604,4591],{"class":1381},[1366,4606,4607],{"class":1381}," target",[1366,4609,4610],{"class":1381},"     prot",[1366,4612,4613],{"class":1381}," opt",[1366,4615,4616],{"class":1381}," in",[1366,4618,4619],{"class":1381},"     out",[1366,4621,4622],{"class":1381},"     source",[1366,4624,4625],{"class":1381},"               destination\n",[1366,4627,4628,4630,4633,4636,4638,4641,4644,4647,4650,4653,4656,4659,4662,4664,4667,4670],{"class":1368,"line":48},[1366,4629,1831],{"class":1377},[1366,4631,4632],{"class":3439},"        6",[1366,4634,4635],{"class":3439},"   360",[1366,4637,4402],{"class":1381},[1366,4639,4640],{"class":3439},"     6",[1366,4642,4643],{"class":1407},"    --",[1366,4645,4646],{"class":1407},"  *",[1366,4648,4649],{"class":1407},"      *",[1366,4651,4652],{"class":1381},"       0.0.0.0\u002F0",[1366,4654,4655],{"class":1381},"  0.0.0.0\u002F0",[1366,4657,4658],{"class":1381},"  tcp",[1366,4660,4661],{"class":1381}," flags:0x06\u002F0x02",[1366,4663,4402],{"class":1381},[1366,4665,4666],{"class":1381}," clamp",[1366,4668,4669],{"class":1381}," to",[1366,4671,4672],{"class":1381}," PMTU\n",[69,4674,4675],{},"Only 6 packets — just the email test traffic. Browser traffic serving the client app was not going through the rule at all. 
The fix was surgical.",[538,4677],{},[73,4679,4681],{"id":4680},"making-it-persistent-the-manual-node","Making It Persistent: The Manual Node",[69,4683,4684],{},"The iptables rule applied manually disappears on reboot. For the manually provisioned test node, the fix is:",[1330,4686,4688],{"className":1360,"code":4687,"language":1362,"meta":419,"style":419},"# Remove the old hardcoded rule\nsudo iptables -t mangle -D FORWARD -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --set-mss 1360\n\n# Add the smarter adaptive rule\nsudo iptables -t mangle -I FORWARD -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --clamp-mss-to-pmtu\n\n# Install persistence\nsudo apt-get install -y iptables-persistent\nsudo netfilter-persistent save\n",[126,4689,4690,4695,4728,4732,4737,4767,4771,4776,4792],{"__ignoreMap":419},[1366,4691,4692],{"class":1368,"line":12},[1366,4693,4694],{"class":1371},"# Remove the old hardcoded rule\n",[1366,4696,4697,4699,4701,4703,4705,4708,4710,4712,4714,4716,4718,4720,4722,4724,4726],{"class":1368,"line":21},[1366,4698,4366],{"class":1377},[1366,4700,4369],{"class":1381},[1366,4702,4372],{"class":1407},[1366,4704,4375],{"class":1381},[1366,4706,4707],{"class":1407}," -D",[1366,4709,4381],{"class":1381},[1366,4711,4384],{"class":1407},[1366,4713,4387],{"class":1381},[1366,4715,4390],{"class":1407},[1366,4717,4393],{"class":1381},[1366,4719,4396],{"class":1381},[1366,4721,4399],{"class":1407},[1366,4723,4402],{"class":1381},[1366,4725,4405],{"class":1407},[1366,4727,4408],{"class":3439},[1366,4729,4730],{"class":1368,"line":30},[1366,4731,1895],{"emptyLinePlaceholder":436},[1366,4733,4734],{"class":1368,"line":39},[1366,4735,4736],{"class":1371},"# Add the smarter adaptive 
rule\n",[1366,4738,4739,4741,4743,4745,4747,4749,4751,4753,4755,4757,4759,4761,4763,4765],{"class":1368,"line":48},[1366,4740,4366],{"class":1377},[1366,4742,4369],{"class":1381},[1366,4744,4372],{"class":1407},[1366,4746,4375],{"class":1381},[1366,4748,4378],{"class":1407},[1366,4750,4381],{"class":1381},[1366,4752,4384],{"class":1407},[1366,4754,4387],{"class":1381},[1366,4756,4390],{"class":1407},[1366,4758,4393],{"class":1381},[1366,4760,4396],{"class":1381},[1366,4762,4399],{"class":1407},[1366,4764,4402],{"class":1381},[1366,4766,4477],{"class":1407},[1366,4768,4769],{"class":1368,"line":57},[1366,4770,1895],{"emptyLinePlaceholder":436},[1366,4772,4773],{"class":1368,"line":1536},[1366,4774,4775],{"class":1371},"# Install persistence\n",[1366,4777,4778,4780,4783,4786,4789],{"class":1368,"line":1557},[1366,4779,4366],{"class":1377},[1366,4781,4782],{"class":1381}," apt-get",[1366,4784,4785],{"class":1381}," install",[1366,4787,4788],{"class":1407}," -y",[1366,4790,4791],{"class":1381}," iptables-persistent\n",[1366,4793,4794,4796,4799],{"class":1368,"line":1577},[1366,4795,4366],{"class":1377},[1366,4797,4798],{"class":1381}," netfilter-persistent",[1366,4800,4801],{"class":1381}," save\n",[69,4803,4804,4807],{},[126,4805,4806],{},"iptables-persistent"," saves the current rules to disk and restores them automatically on every boot.",[538,4809],{},[73,4811,4813],{"id":4812},"making-it-automatic-the-asg-nodes","Making It Automatic: The ASG Nodes",[69,4815,4816],{},"The bigger concern was the Auto Scaling Group. Our OpenStack ASG spins up new worker nodes automatically when load increases. Each new node is an OpenStack VM and would have the same MTU mismatch out of the box. 
If a service happened to land on a new ASG node and made outbound HTTP calls, it would silently fail — and we might not notice until something like an email timeout surfaced it.",[69,4818,4819,4820,4823],{},"The fix belongs in the ",[126,4821,4822],{},"user_data"," cloud-init script that runs on every new node at boot. In our Heat template, right after the Swarm join:",[1330,4825,4827],{"className":1360,"code":4826,"language":1362,"meta":419,"style":419},"echo \"Joining swarm at ${MANAGER} advertising ${FLOATING_IP}...\"\ndocker swarm join --token \"$TOKEN\" \"$MANAGER\" \\\n  --advertise-addr \"$FLOATING_IP\" \\\n  --listen-addr 0.0.0.0:2377\necho \"Swarm join complete.\"\n\n# ── Fix MTU mismatch between Docker (1500) and OpenStack interface (1450) ──\necho \"--- Applying MTU fix ---\"\necho \"Host interface MTU before fix:\"\nip link show ens3 | grep mtu\n\necho \"Applying TCPMSS clamp rule...\"\niptables -t mangle -I FORWARD -p tcp --tcp-flags SYN,RST SYN -j TCPMSS --clamp-mss-to-pmtu\necho \"TCPMSS rule applied.\"\n\necho \"Verifying rule:\"\niptables -t mangle -L FORWARD -n -v --line-numbers\n\necho \"Installing iptables-persistent...\"\nDEBIAN_FRONTEND=noninteractive apt-get install -y iptables-persistent\necho \"iptables-persistent installed.\"\n\necho \"Saving rules...\"\nnetfilter-persistent save\necho \"Rules saved.\"\n\necho \"--- MTU fix complete ---\"\n",[126,4828,4829,4863,4887,4899,4905,4916,4920,4925,4936,4947,4969,4973,4984,5012,5023,5027,5038,5056,5060,5071,5089,5100,5104,5115,5122,5133,5137],{"__ignoreMap":419},[1366,4830,4831,4834,4836,4839,4842,4845,4848,4851,4853,4856,4858,4861],{"class":1368,"line":12},[1366,4832,4833],{"class":1455},"echo",[1366,4835,1505],{"class":1468},[1366,4837,4838],{"class":1381},"Joining swarm at ",[1366,4840,4841],{"class":1440},"${",[1366,4843,4844],{"class":1381},"MANAGER",[1366,4846,4847],{"class":1440},"}",[1366,4849,4850],{"class":1381}," advertising 
",[1366,4852,4841],{"class":1440},[1366,4854,4855],{"class":1381},"FLOATING_IP",[1366,4857,4847],{"class":1440},[1366,4859,4860],{"class":1381},"...",[1366,4862,1574],{"class":1468},[1366,4864,4865,4867,4869,4871,4873,4875,4877,4879,4881,4883,4885],{"class":1368,"line":21},[1366,4866,4070],{"class":1377},[1366,4868,4245],{"class":1381},[1366,4870,4248],{"class":1381},[1366,4872,4251],{"class":1407},[1366,4874,1505],{"class":1468},[1366,4876,4256],{"class":1381},[1366,4878,1459],{"class":1468},[1366,4880,1505],{"class":1468},[1366,4882,4263],{"class":1381},[1366,4884,1459],{"class":1468},[1366,4886,4268],{"class":1407},[1366,4888,4889,4891,4893,4895,4897],{"class":1368,"line":30},[1366,4890,4273],{"class":1407},[1366,4892,1505],{"class":1468},[1366,4894,4278],{"class":1381},[1366,4896,1459],{"class":1468},[1366,4898,4268],{"class":1407},[1366,4900,4901,4903],{"class":1368,"line":39},[1366,4902,4287],{"class":1407},[1366,4904,4290],{"class":1381},[1366,4906,4907,4909,4911,4914],{"class":1368,"line":48},[1366,4908,4833],{"class":1455},[1366,4910,1505],{"class":1468},[1366,4912,4913],{"class":1381},"Swarm join complete.",[1366,4915,1574],{"class":1468},[1366,4917,4918],{"class":1368,"line":57},[1366,4919,1895],{"emptyLinePlaceholder":436},[1366,4921,4922],{"class":1368,"line":1536},[1366,4923,4924],{"class":1371},"# ── Fix MTU mismatch between Docker (1500) and OpenStack interface (1450) ──\n",[1366,4926,4927,4929,4931,4934],{"class":1368,"line":1557},[1366,4928,4833],{"class":1455},[1366,4930,1505],{"class":1468},[1366,4932,4933],{"class":1381},"--- Applying MTU fix ---",[1366,4935,1574],{"class":1468},[1366,4937,4938,4940,4942,4945],{"class":1368,"line":1577},[1366,4939,4833],{"class":1455},[1366,4941,1505],{"class":1468},[1366,4943,4944],{"class":1381},"Host interface MTU before fix:",[1366,4946,1574],{"class":1468},[1366,4948,4949,4952,4955,4958,4961,4963,4966],{"class":1368,"line":1583},[1366,4950,4951],{"class":1377},"ip",[1366,4953,4954],{"class":1381}," 
link",[1366,4956,4957],{"class":1381}," show",[1366,4959,4960],{"class":1381}," ens3",[1366,4962,1401],{"class":1385},[1366,4964,4965],{"class":1377}," grep",[1366,4967,4968],{"class":1381}," mtu\n",[1366,4970,4971],{"class":1368,"line":1589},[1366,4972,1895],{"emptyLinePlaceholder":436},[1366,4974,4975,4977,4979,4982],{"class":1368,"line":1968},[1366,4976,4833],{"class":1455},[1366,4978,1505],{"class":1468},[1366,4980,4981],{"class":1381},"Applying TCPMSS clamp rule...",[1366,4983,1574],{"class":1468},[1366,4985,4986,4988,4990,4992,4994,4996,4998,5000,5002,5004,5006,5008,5010],{"class":1368,"line":1980},[1366,4987,4452],{"class":1377},[1366,4989,4372],{"class":1407},[1366,4991,4375],{"class":1381},[1366,4993,4378],{"class":1407},[1366,4995,4381],{"class":1381},[1366,4997,4384],{"class":1407},[1366,4999,4387],{"class":1381},[1366,5001,4390],{"class":1407},[1366,5003,4393],{"class":1381},[1366,5005,4396],{"class":1381},[1366,5007,4399],{"class":1407},[1366,5009,4402],{"class":1381},[1366,5011,4477],{"class":1407},[1366,5013,5014,5016,5018,5021],{"class":1368,"line":1986},[1366,5015,4833],{"class":1455},[1366,5017,1505],{"class":1468},[1366,5019,5020],{"class":1381},"TCPMSS rule applied.",[1366,5022,1574],{"class":1468},[1366,5024,5025],{"class":1368,"line":1992},[1366,5026,1895],{"emptyLinePlaceholder":436},[1366,5028,5029,5031,5033,5036],{"class":1368,"line":1997},[1366,5030,4833],{"class":1455},[1366,5032,1505],{"class":1468},[1366,5034,5035],{"class":1381},"Verifying 
rule:",[1366,5037,1574],{"class":1468},[1366,5039,5040,5042,5044,5046,5048,5050,5052,5054],{"class":1368,"line":2009},[1366,5041,4452],{"class":1377},[1366,5043,4372],{"class":1407},[1366,5045,4375],{"class":1381},[1366,5047,4551],{"class":1407},[1366,5049,4381],{"class":1381},[1366,5051,4556],{"class":1407},[1366,5053,4559],{"class":1407},[1366,5055,4562],{"class":1407},[1366,5057,5058],{"class":1368,"line":2018},[1366,5059,1895],{"emptyLinePlaceholder":436},[1366,5061,5062,5064,5066,5069],{"class":1368,"line":2024},[1366,5063,4833],{"class":1455},[1366,5065,1505],{"class":1468},[1366,5067,5068],{"class":1381},"Installing iptables-persistent...",[1366,5070,1574],{"class":1468},[1366,5072,5073,5076,5078,5081,5083,5085,5087],{"class":1368,"line":2034},[1366,5074,5075],{"class":3469},"DEBIAN_FRONTEND",[1366,5077,3473],{"class":1440},[1366,5079,5080],{"class":1381},"noninteractive",[1366,5082,4782],{"class":1377},[1366,5084,4785],{"class":1381},[1366,5086,4788],{"class":1407},[1366,5088,4791],{"class":1381},[1366,5090,5091,5093,5095,5098],{"class":1368,"line":2041},[1366,5092,4833],{"class":1455},[1366,5094,1505],{"class":1468},[1366,5096,5097],{"class":1381},"iptables-persistent installed.",[1366,5099,1574],{"class":1468},[1366,5101,5102],{"class":1368,"line":2051},[1366,5103,1895],{"emptyLinePlaceholder":436},[1366,5105,5106,5108,5110,5113],{"class":1368,"line":2058},[1366,5107,4833],{"class":1455},[1366,5109,1505],{"class":1468},[1366,5111,5112],{"class":1381},"Saving rules...",[1366,5114,1574],{"class":1468},[1366,5116,5117,5120],{"class":1368,"line":2072},[1366,5118,5119],{"class":1377},"netfilter-persistent",[1366,5121,4801],{"class":1381},[1366,5123,5124,5126,5128,5131],{"class":1368,"line":2081},[1366,5125,4833],{"class":1455},[1366,5127,1505],{"class":1468},[1366,5129,5130],{"class":1381},"Rules 
saved.",[1366,5132,1574],{"class":1468},[1366,5134,5135],{"class":1368,"line":2087},[1366,5136,1895],{"emptyLinePlaceholder":436},[1366,5138,5140,5142,5144,5147],{"class":1368,"line":5139},27,[1366,5141,4833],{"class":1455},[1366,5143,1505],{"class":1468},[1366,5145,5146],{"class":1381},"--- MTU fix complete ---",[1366,5148,1574],{"class":1468},[69,5150,1682,5151,5154,5155,5158],{},[126,5152,5153],{},"DEBIAN_FRONTEND=noninteractive"," flag is important. Without it, ",[126,5156,5157],{},"apt-get install iptables-persistent"," will pause and wait for interactive input asking whether to save current IPv4 and IPv6 rules — something that cannot happen in an automated script. The flag suppresses all prompts.",[69,5160,5161,5162,5165],{},"Every new ASG node now gets the fix automatically at boot, and the log at ",[126,5163,5164],{},"\u002Fvar\u002Flog\u002Fswarm-setup.log"," will contain a full trace of the MTU fix running, so you can verify it without SSHing into the node.",[538,5167],{},[73,5169,5171],{"id":5170},"what-we-did-not-need-to-do","What We Did Not Need to Do",[69,5173,5174,5175,5178],{},"It is worth being explicit about this. The following changes that are commonly suggested for MTU problems in Docker Swarm were ",[81,5176,5177],{},"not needed"," for our specific situation:",[91,5180,5181,5187,5190,5193,5199,5202],{},[94,5182,5183,5184,5186],{},"❌ Changing ",[126,5185,4500],{}," MTU",[94,5188,5189],{},"❌ Recreating overlay networks",[94,5191,5192],{},"❌ Recreating the ingress network",[94,5194,5195,5196],{},"❌ Changing host interface MTU with ",[126,5197,5198],{},"ip link set",[94,5200,5201],{},"❌ Draining any nodes",[94,5203,5204],{},"❌ Any service restarts",[69,5206,5207],{},"The reason is that our problem was not in the Docker overlay between nodes. It was in outbound TCP from containers on OpenStack nodes going through a double-NAT path. 
The TCPMSS clamp fixed it at exactly the right layer.",[538,5209],{},[73,5211,5213],{"id":5212},"lessons-learned","Lessons Learned",[69,5215,5216,5219],{},[81,5217,5218],{},"1. Trace the actual traffic path before deciding where to fix.","\nMTU problems in hybrid environments are rarely a single-layer issue. Our traffic was going: container → Docker gateway bridge → OpenStack interface → external network → HAProxy → Traefik → back into OpenStack. Understanding that path was what led us to the right fix.",[69,5221,5222,5225],{},[81,5223,5224],{},"2. Asymmetry in failures is a signal, not noise.","\nThe fact that the monitoring node worked but the test node did not was the most important clue. Same destination, same service, different result. That asymmetry pointed directly at the source node's network path being different — which led us to the floating IP and double-NAT discovery.",[69,5227,5228,5231],{},[81,5229,5230],{},"3. The standard fix is not always the right fix.","\nThe Docker daemon MTU approach is correct for overlay network MTU mismatches. But applying it blindly would have caused unnecessary downtime and not addressed the root cause.",[69,5233,5234,5237],{},[81,5235,5236],{},"4. Bake infrastructure fixes into provisioning, not just running nodes.","\nFixing the running node is only half the job. If your ASG spins up ten new nodes tomorrow and they all have the same problem, you will be chasing the same fire. The fix belongs in the provisioning script.",[69,5239,5240,5243],{},[81,5241,5242],{},"5. Silent drops are the hardest bugs.","\nNo error. No ICMP response. No log entry on the receiving side. Just a timeout on the sender. 
These are the bugs that can send you chasing application code, DNS, TLS, or service configuration for hours before you think to check MTU.",[538,5245],{},[73,5247,5249],{"id":5248},"summary","Summary",[873,5251,5252,5262],{},[876,5253,5254],{},[879,5255,5256,5259],{},[882,5257,5258],{},"What we thought the problem was",[882,5260,5261],{},"Docker overlay MTU mismatch",[889,5263,5264,5272,5280,5288,5296,5307],{},[879,5265,5266,5269],{},[894,5267,5268],{},"What the problem actually was",[894,5270,5271],{},"Docker (MTU 1500) vs OpenStack interface (MTU 1450) mismatch on outbound TCP from OpenStack nodes",[879,5273,5274,5277],{},[894,5275,5276],{},"Why it only affected OpenStack nodes",[894,5278,5279],{},"Outside nodes have real 1500 MTU interfaces with no mismatch",[879,5281,5282,5285],{},[894,5283,5284],{},"Why monitoring worked but test node failed",[894,5286,5287],{},"Monitoring node is outside OpenStack, only crosses the boundary once",[879,5289,5290,5293],{},[894,5291,5292],{},"The fix",[894,5294,5295],{},"TCPMSS iptables clamp on each OpenStack node",[879,5297,5298,5301],{},[894,5299,5300],{},"Where the fix lives",[894,5302,5303,5304,5306],{},"Manually on existing nodes, baked into Heat ",[126,5305,4822],{}," for ASG nodes",[879,5308,5309,5312],{},[894,5310,5311],{},"What we avoided",[894,5313,5314],{},"Any Docker network changes, downtime, service restarts",[69,5316,5317],{},"The infrastructure is hybrid by design and will stay that way until OpenStack proves itself reliable enough to trust fully. 
In the meantime, understanding exactly how packets move through a mixed environment — and where they can silently disappear — is what keeps things running.",[2304,5319,5320],{},"html pre.shiki code .sYZai, html code.shiki .sYZai{--shiki-default:#999999}html pre.shiki code .s61at, html code.shiki .s61at{--shiki-default:#99841877}html pre.shiki code .su6XF, html code.shiki .su6XF{--shiki-default:#998418}html pre.shiki code .s-TwI, html code.shiki .s-TwI{--shiki-default:#2F798A}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sySUi, html code.shiki .sySUi{--shiki-default:#59873A}html pre.shiki code .spphp, html code.shiki .spphp{--shiki-default:#B56959}html pre.shiki code .sEi1f, html code.shiki .sEi1f{--shiki-default:#A65E2B}html pre.shiki code .sSP4y, html code.shiki .sSP4y{--shiki-default:#B5695977}html pre.shiki code .suHK_, html code.shiki .suHK_{--shiki-default:#393A34}html pre.shiki code .s8zF2, html code.shiki .s8zF2{--shiki-default:#A0ADA0}html pre.shiki code .si04Y, html code.shiki .si04Y{--shiki-default:#AB5959}html pre.shiki code .svycV, html code.shiki 
.svycV{--shiki-default:#B07D48}",{"title":419,"searchDepth":21,"depth":21,"links":5322},[5323,5324,5325,5326,5327,5328,5329,5330,5331,5334,5335,5336,5337,5338,5339],{"id":3896,"depth":21,"text":3897},{"id":3936,"depth":21,"text":3937},{"id":3951,"depth":21,"text":3952},{"id":4044,"depth":21,"text":4045},{"id":4056,"depth":21,"text":4057},{"id":4160,"depth":21,"text":4161},{"id":4229,"depth":21,"text":4230},{"id":4320,"depth":21,"text":4321},{"id":4352,"depth":21,"text":4353,"children":5332},[5333],{"id":4414,"depth":30,"text":4415},{"id":4488,"depth":21,"text":4489},{"id":4680,"depth":21,"text":4681},{"id":4812,"depth":21,"text":4813},{"id":5170,"depth":21,"text":5171},{"id":5212,"depth":21,"text":5213},{"id":5248,"depth":21,"text":5249},"2026-05-14","A deep dive into networking, resource limits, and automated scaling strategies for Docker Swarm on OpenStack.",{},"\u002Fblog\u002Fmtu-troubleshooting","5 min",{"title":3884,"description":5341},"blog\u002Fmtu-troubleshooting\u002Findex",[5348,5349,2329,5350],"Docker","OpenStack","Infrastructure","\u002Fimages\u002Fthumbnails\u002Fmtu-troubleshooting-blog.png","63-y55S9q1R7XroaCNcQjwFJE9Gr4Gw_65fkf2fP3aI",{"id":5354,"title":5355,"body":5356,"category":5461,"date":5462,"description":5463,"extension":434,"meta":5464,"navigation":436,"path":5465,"readTime":5466,"seo":5467,"stem":5468,"tags":5469,"thumbnail":5472,"__hash__":5473},"blog\u002Fblog\u002Fwelcome-digital-sandbox\u002Findex.md","🚀 Welcome to My Digital Sandbox!",{"type":66,"value":5357,"toc":5457},[5358,5366,5373,5377,5384,5387,5390,5397,5403,5406,5410,5413,5445,5451],[69,5359,5360,5361,5365],{},"My great friend, ",[5362,5363],"mention",{"name":5364},"Joe"," thinks I’m doing some amazing stuff and we discuss these things over call on our routine catch ups. 
I felt like writing again, after the very long hiatus from my blog in 2018.\nSo yeah I am going to start writing again!",[69,5367,5368],{},[5369,5370],"img",{"alt":5371,"src":5372},"Geeking Out over Tech","https:\u002F\u002Fmedia.giphy.com\u002Fmedia\u002Fv1.Y2lkPTc5MGI3NjExZWliZ2l2aWZxNmJhZGh5a3JoZjF4M2dvaDVrZWdjOWxlYXlneWM3NCZlcD12MV9naWZzX3NlYXJjaCZjdD1n\u002F0ixAZaU8Gp8R5TdRQT\u002Fgiphy.gif",[86,5374,5376],{"id":5375},"the-ai-superpower-the-80-trap","⚡ The AI Superpower & The \"80% Trap\"",[69,5378,5379,5380,5383],{},"Fortunately for me—and I mean ",[1673,5381,5382],{},"fortunately","—AI has completely changed the game.",[69,5385,5386],{},"I use AI to move at breakneck speed. I can spin up prototypes, test out wild ideas, and ditch them if they don't work faster than it used to take just to configure a boilerplate setup.",[69,5388,5389],{},"⚡ IDEAS -> PROTOTYPE -> NEXT BIG THING",[69,5391,5392,5393,5396],{},"I'm the kind of engineer who always needs something active to chew on. Like many creatives, I suffer from ",[81,5394,5395],{},"\"Shiny Object Syndrome\"","—you know, that classic trap where you hit 80% completion, the core structural problems are solved, you lose interest, and your brain screams for a new challenge. Because of that, I am constantly hunting for new ideas and fresh projects.",[69,5398,5399],{},[5369,5400],{"alt":5401,"src":5402},"Moving Fast and Breaking Things","https:\u002F\u002Fmedia.giphy.com\u002Fmedia\u002Fv1.Y2lkPTc5MGI3NjExM3E4OXEweXhudW16bGszcHZnd3VnbnlkZHNldjJyYTZ0Y3N6cDE2dyZlcD12MV9naWZzX3NlYXJjaCZjdD1n\u002Fr2BtghAUTmpP2\u002Fgiphy.gif",[69,5404,5405],{},"So, back to the point: I am going to spend some time writing about all the fun, chaotic things I encounter as I dive into the uncomfortable—breaking things, fixing them, and learning on the fly. 
I already have a few of these write-ups sitting in my Notion, so I’ll be migrating them over here soon.",[73,5407,5409],{"id":5408},"️-what-can-you-expect-here","🗺️ What Can You Expect Here?",[69,5411,5412],{},"Here is a roadmap of the chaos and insights I'll be dumping into this space:",[91,5414,5415,5421,5427,5433,5439],{},[94,5416,5417,5420],{},[81,5418,5419],{},"🏢 On-Premise Infrastructure:"," I want to keep my bare-metal and self-hosted knowledge alive and kicking, so expect deep dives into hardware and infrastructure management.",[94,5422,5423,5426],{},[81,5424,5425],{},"🔷 C# Mastery:"," I write C# daily. Whenever I find a clever optimization at work (that I can freely share), or stumble on something fun, it's going right here.",[94,5428,5429,5432],{},[81,5430,5431],{},"🐳 Docker, Swarm, & Linux:"," I’ve spent a lot of time breaking and fixing things in cluster environments. I've got a lot of hard-learned lessons to share.",[94,5434,5435,5438],{},[81,5436,5437],{},"🤖 AI & RAG:"," Ha! I have an active Retrieval-Augmented Generation (RAG) project doing some incredibly cool things. I’ll be breaking down how it works and how I leverage AI daily.",[94,5440,5441,5444],{},[81,5442,5443],{},"🌱 Life & Daily Learnings:"," General thoughts, philosophical brain dumps, and a running log of the new things I learn every single day.",[69,5446,5447],{},[5369,5448],{"alt":5449,"src":5450},"Let's Build","https:\u002F\u002Fmedia.giphy.com\u002Fmedia\u002Fv1.Y2lkPTc5MGI3NjExbmswM3g4amw0NndpZ3V5cm15NmFyeXN6bTZtc3Z6bndvbmNndXFhNCZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw\u002FYl5aO3gdVfsQ0\u002Fgiphy.gif",[69,5452,5453,5456],{},[1673,5454,5455],{},"Stay tuned. 
We're going to break some things!"," 🔥",{"title":419,"searchDepth":21,"depth":21,"links":5458},[5459,5460],{"id":5375,"depth":30,"text":5376},{"id":5408,"depth":21,"text":5409},"Thoughts","2025-02-27","Joe thinks I’m doing some amazing stuff and honestly, he wanted a front-row seat to read about it.",{},"\u002Fblog\u002Fwelcome-digital-sandbox","2 min",{"title":5355,"description":5463},"blog\u002Fwelcome-digital-sandbox\u002Findex",[5470,5471,517,5364],"Intro","AI","\u002Fimages\u002Fthumbnails\u002Favatar.png","vNCKmZK-esP9P8yFaHtk87tlNS4TjT7bYcH2gCuFPHs",1780657374328]