<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
  <channel>
    <title>InfoQ - Cloud - Articles</title>
    <link>https://www.infoq.com</link>
    <description>InfoQ Cloud Articles feed</description>
    <item>
      <title>Article: Two Misconfigurations That Caused Spark OOM Failures on Kubernetes</title>
      <link>https://www.infoq.com/articles/spark-oom-kubernetes-misconfigurations/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Cloud-articles</link>
      <description>&lt;img src="https://res.infoq.com/articles/spark-oom-kubernetes-misconfigurations/en/headerimage/spark-oom-kubernetes-misconfigurations-header-1780044756757.jpg"/&gt;&lt;p&gt;After migrating Spark pipelines to Azure Kubernetes Service, two infrastructure settings interacted destructively: spark.kubernetes.local.dirs.tmpfs=true backed shuffle spill with RAM instead of disk, and a hard podAffinity rule forced all executors onto one node. Together, they caused repeated OOM kills invisible to standard diagnostics.&lt;/p&gt; &lt;i&gt;By Pranav Bhasker&lt;/i&gt;</description>
      <category>Cloud</category>
      <category>Apache Spark</category>
      <category>Kubernetes</category>
      <category>AI, ML &amp; Data Engineering</category>
      <category>DevOps</category>
      <category>Development</category>
      <category>Architecture &amp; Design</category>
      <category>article</category>
      <pubDate>Wed, 03 Jun 2026 09:00:00 GMT</pubDate>
      <guid>https://www.infoq.com/articles/spark-oom-kubernetes-misconfigurations/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Cloud-articles</guid>
      <dc:creator>Pranav Bhasker</dc:creator>
      <dc:date>2026-06-03T09:00:00Z</dc:date>
      <dc:identifier>/articles/spark-oom-kubernetes-misconfigurations/en</dc:identifier>
    </item>
    <item>
      <title>Article: Stragglers, Not Failures: How Adaptive Hedged Requests Reduce p99 Latency by 74 Percent</title>
      <link>https://www.infoq.com/articles/adaptive-hedged-requests-p99-latency/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Cloud-articles</link>
      <description>&lt;img src="https://res.infoq.com/articles/adaptive-hedged-requests-p99-latency/en/headerimage/adaptive-hedged-requests-p99-latency-header-1779785816730.jpg"/&gt;&lt;p&gt;In fan-out microservice architectures, slow-but-completing requests accumulate across services and drive p99 latency far higher than per-service metrics suggest. This article presents an adaptive hedging mechanism that uses DDSketch for real-time quantile estimation, windowed rotation to handle distribution drift, and a token-bucket budget to prevent load amplification.&lt;/p&gt; &lt;i&gt;By Prathamesh Bhope&lt;/i&gt;</description>
      <category>Cloud</category>
      <category>Architecture</category>
      <category>Distributed Systems</category>
      <category>Performance</category>
      <category>DevOps</category>
      <category>Development</category>
      <category>Architecture &amp; Design</category>
      <category>article</category>
      <pubDate>Thu, 28 May 2026 09:00:00 GMT</pubDate>
      <guid>https://www.infoq.com/articles/adaptive-hedged-requests-p99-latency/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=Cloud-articles</guid>
      <dc:creator>Prathamesh Bhope</dc:creator>
      <dc:date>2026-05-28T09:00:00Z</dc:date>
      <dc:identifier>/articles/adaptive-hedged-requests-p99-latency/en</dc:identifier>
    </item>
  </channel>
</rss>
