<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
  <channel>
    <title>InfoQ - Site Reliability Engineering</title>
    <link>https://www.infoq.com</link>
    <description>InfoQ Site Reliability Engineering feed</description>
    <item>
      <title>Article: Configuration as a Control Plane: Designing for Safety and Reliability at Scale</title>
      <link>https://www.infoq.com/articles/configuration-control-plane/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Site+Reliability+Engineering</link>
      <description>&lt;img src="https://res.infoq.com/articles/configuration-control-plane/en/headerimage/configuration-as-a-control-plane-designing-for-safety-and-reliability-at-scale-header-1773657574566.jpg"/&gt;&lt;p&gt;Configuration has evolved from static deployment files into a live control plane that directly shapes system behavior. The evolution of configuration management highlights why misconfigurations can trigger large outages and how hyperscalers deploy changes safely using staged rollouts, validation, blast radius limits, and automated rollback at scale.&lt;/p&gt; &lt;i&gt;By Karthiek Maralla&lt;/i&gt;</description>
      <category>Site Reliability Engineering</category>
      <category>Cloud Architecture</category>
      <category>Configuration Management</category>
      <category>Policy as Code</category>
      <category>Distributed Systems</category>
      <category>GitOps</category>
      <category>Platform Engineering</category>
      <category>Development</category>
      <category>Architecture &amp; Design</category>
      <category>DevOps</category>
      <category>article</category>
      <pubDate>Fri, 20 Mar 2026 09:00:00 GMT</pubDate>
      <guid>https://www.infoq.com/articles/configuration-control-plane/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Site+Reliability+Engineering</guid>
      <dc:creator>Karthiek Maralla</dc:creator>
      <dc:date>2026-03-20T09:00:00Z</dc:date>
      <dc:identifier>/articles/configuration-control-plane/en</dc:identifier>
    </item>
    <item>
      <title>QCon London 2026: Wrangling Telemetry at Scale, a Guide to Self-Hosted Observability</title>
      <link>https://www.infoq.com/news/2026/03/self-hosted-observability/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Site+Reliability+Engineering</link>
      <description>&lt;img src="https://res.infoq.com/news/2026/03/self-hosted-observability/en/headerimage/generatedHeaderImage-1773862581289.jpg"/&gt;&lt;p&gt;At QCon London 2026, Colin Douch discussed building and operating self-hosted monitoring stacks, surveyed the current tooling landscape, and explained how to build a coherent observability setup rather than treating logs, metrics, and traces as separate pillars.&lt;/p&gt; &lt;i&gt;By Renato Losio&lt;/i&gt;</description>
      <category>OpenTelemetry</category>
      <category>Site Reliability Engineering</category>
      <category>Telemetry</category>
      <category>Distributed Tracing</category>
      <category>Metrics</category>
      <category>Logging</category>
      <category>Prometheus</category>
      <category>Observability</category>
      <category>Development</category>
      <category>DevOps</category>
      <category>news</category>
      <pubDate>Thu, 19 Mar 2026 09:17:00 GMT</pubDate>
      <guid>https://www.infoq.com/news/2026/03/self-hosted-observability/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Site+Reliability+Engineering</guid>
      <dc:creator>Renato Losio</dc:creator>
      <dc:date>2026-03-19T09:17:00Z</dc:date>
      <dc:identifier>/news/2026/03/self-hosted-observability/en</dc:identifier>
    </item>
    <item>
      <title>War in Iran Damages Multiple AWS Data Centers, Challenging Multi-AZ Assumptions</title>
      <link>https://www.infoq.com/news/2026/03/aws-multiaz-conflict-outage/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Site+Reliability+Engineering</link>
      <description>&lt;img src="https://res.infoq.com/news/2026/03/aws-multiaz-conflict-outage/en/headerimage/generatedHeaderImage-1773671194784.jpg"/&gt;&lt;p&gt;Earlier this month, Iranian drone strikes damaged three AWS data centers in the UAE and Bahrain, causing outages and disruptions to multiple services. The events, which affected multiple facilities within the same AWS region, sparked discussion in the community about how geopolitical conflict can directly impact global cloud infrastructure and multi-AZ deployments.&lt;/p&gt; &lt;i&gt;By Renato Losio&lt;/i&gt;</description>
      <category>Site Reliability Engineering</category>
      <category>Architecture</category>
      <category>multi-region</category>
      <category>AWS</category>
      <category>Cloud</category>
      <category>Failure</category>
      <category>Architecture &amp; Design</category>
      <category>DevOps</category>
      <category>news</category>
      <pubDate>Wed, 18 Mar 2026 08:17:00 GMT</pubDate>
      <guid>https://www.infoq.com/news/2026/03/aws-multiaz-conflict-outage/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Site+Reliability+Engineering</guid>
      <dc:creator>Renato Losio</dc:creator>
      <dc:date>2026-03-18T08:17:00Z</dc:date>
      <dc:identifier>/news/2026/03/aws-multiaz-conflict-outage/en</dc:identifier>
    </item>
  </channel>
</rss>
