<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
  <channel>
    <title>InfoQ - PDF</title>
    <link>https://www.infoq.com</link>
    <description>InfoQ PDF feed</description>
    <item>
      <title>Hugging Face Releases FinePDFs: a 3-Trillion-Token Dataset Built from PDFs</title>
      <link>https://www.infoq.com/news/2025/09/finepdfs/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=PDF</link>
      <description>&lt;img src="https://res.infoq.com/news/2025/09/finepdfs/en/headerimage/generatedHeaderImage-1757926150946.jpg"/&gt;&lt;p&gt;Hugging Face has unveiled FinePDFs, the largest publicly available corpus built entirely from PDFs. The dataset spans 475 million documents in 1,733 languages, totaling roughly 3 trillion tokens. At 3.65 terabytes in size, FinePDFs introduces a new dimension to open training datasets by tapping into a resource long considered too complex and expensive to process.&lt;/p&gt; &lt;i&gt;By Robert Krzaczyński&lt;/i&gt;</description>
      <category>PDF</category>
      <category>Artificial Intelligence</category>
      <category>Hugging Face</category>
      <category>Large language models</category>
      <category>AI, ML &amp; Data Engineering</category>
      <category>news</category>
      <pubDate>Mon, 15 Sep 2025 08:55:00 GMT</pubDate>
      <guid>https://www.infoq.com/news/2025/09/finepdfs/?utm_campaign=infoq_content&amp;utm_source=infoq&amp;utm_medium=feed&amp;utm_term=PDF</guid>
      <dc:creator>Robert Krzaczyński</dc:creator>
      <dc:date>2025-09-15T08:55:00Z</dc:date>
      <dc:identifier>/news/2025/09/finepdfs/en</dc:identifier>
    </item>
  </channel>
</rss>
