<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:content="http://purl.org/rss/1.0/modules/content/">
    <channel>
        <title>Extious’ Notes</title>
        <link>https://notion.zhaozhan.site/</link>
        <description>Extious 的个人笔记</description>
        <lastBuildDate>Wed, 06 Aug 2025 02:17:41 GMT</lastBuildDate>
        <docs>https://validator.w3.org/feed/docs/rss2.html</docs>
        <generator>https://github.com/jpmonette/feed</generator>
        <language>zh-CN</language>
        <copyright>All rights reserved 2025, Extious</copyright>
        <item>
            <title><![CDATA[Computer Network]]></title>
            <link>https://notion.zhaozhan.site/article/12ac030b-24dd-80cd-ab62-cb98b6c65978</link>
            <guid>https://notion.zhaozhan.site/article/12ac030b-24dd-80cd-ab62-cb98b6c65978</guid>
            <pubDate>Fri, 25 Oct 2024 00:00:00 GMT</pubDate>
            <content:encoded><![CDATA[<div id="notion-article" class="mx-auto overflow-hidden "><main class="notion light-mode notion-page notion-block-12ac030b24dd80cdab62cb98b6c65978"><div class="notion-viewport"></div><a class="notion-page-link notion-block-12ac030b24dd807e91f0eb3891011ce0" href="/12ac030b24dd807e91f0eb3891011ce0"><span class="notion-page-title"><div class="notion-page-icon-inline notion-page-icon-image"><svg class="notion-page-title-icon notion-page-icon" alt="WebSocket" viewBox="0 0 30 30" width="16"><path d="M16,1H4v28h22V11L16,1z M16,3.828L23.172,11H16V3.828z M24,27H6V3h8v10h10V27z M8,17h14v-2H8V17z M8,21h14v-2H8V21z M8,25h14v-2H8V25z"></path></svg></div><span class="notion-page-title-text">WebSocket</span></span></a><div class="notion-blank notion-block-245c030b24dd80f0a086ecacf08fb76e"> </div></main></div>]]></content:encoded>
        </item>
        <item>
            <title><![CDATA[NCCL通信]]></title>
            <link>https://notion.zhaozhan.site/article/145c030b-24dd-8028-a995-ea92c4c5f7b1</link>
            <guid>https://notion.zhaozhan.site/article/145c030b-24dd-8028-a995-ea92c4c5f7b1</guid>
            <pubDate>Thu, 21 Nov 2024 00:00:00 GMT</pubDate>
            <content:encoded><![CDATA[<div id="notion-article" class="mx-auto overflow-hidden "><main class="notion light-mode notion-page notion-block-145c030b24dd8028a995ea92c4c5f7b1"><div class="notion-viewport"></div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-146c030b24dd805dba29defa9e39a319" data-id="146c030b24dd805dba29defa9e39a319"><span><div id="146c030b24dd805dba29defa9e39a319" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd805dba29defa9e39a319" title="集体通信原语"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">集体通信原语</span></span></h2><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-146c030b24dd80f983e8c7ef19c1fd28" data-id="146c030b24dd80f983e8c7ef19c1fd28"><span><div id="146c030b24dd80f983e8c7ef19c1fd28" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80f983e8c7ef19c1fd28" title="原语介绍"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">原语介绍</span></span></h3><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd802699d2ca76ba355afb" data-id="146c030b24dd802699d2ca76ba355afb"><span><div 
id="146c030b24dd802699d2ca76ba355afb" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd802699d2ca76ba355afb" title="Broadcast"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">Broadcast</span></span></h4><figure class="notion-asset-wrapper notion-asset-wrapper-image notion-block-146c030b24dd805fbdf9cce099314d07"><div style="position:relative;display:flex;justify-content:center;align-self:center;width:650px;max-width:100%;flex-direction:column"><img style="object-fit:cover" src="https://www.notion.so/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2F7599b067-d45c-4f81-9480-91921f5c996e%2F615d1268-2d74-4fb3-aa2f-33c43d60ce4e%2Fimage.png?table=block&amp;id=146c030b-24dd-805f-bdf9-cce099314d07&amp;t=146c030b-24dd-805f-bdf9-cce099314d07&amp;width=650&amp;cache=v2" alt="notion image" loading="lazy" decoding="async"/></div></figure><div class="notion-text notion-block-146c030b24dd80db9abdde29d76a6d20">初始：只有GPU0上有数据</div><div class="notion-text notion-block-146c030b24dd80308c87e29f96742486">GPU0将DATA广播到GPU0, GPU1, GPU2, GPU3.</div><div class="notion-text notion-block-146c030b24dd803fa0a2efff79a3133f">最终：每个GPU数据一样</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd80b8912df27744487f56" data-id="146c030b24dd80b8912df27744487f56"><span><div id="146c030b24dd80b8912df27744487f56" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80b8912df27744487f56" title="Scatter"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 
3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">Scatter</span></span></h4><div class="notion-text notion-block-146c030b24dd80eabb6bf3ba0ea5982d">初始：只有GPU0上有数据</div><div class="notion-text notion-block-146c030b24dd8053bc0ffc7362f5de3d">GPU0将DATA分片再分发给GPU0, GPU1, GPU2, GPU3.</div><div class="notion-text notion-block-146c030b24dd8007a94ce2d6dd6a422f">最终：每个GPU数据不一样</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd80f58f34dc25d29eabf9" data-id="146c030b24dd80f58f34dc25d29eabf9"><span><div id="146c030b24dd80f58f34dc25d29eabf9" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80f58f34dc25d29eabf9" title="Gather"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">Gather</span></span></h4><div class="notion-text notion-block-146c030b24dd80c0a1f0c46a0196d445">Scatter的反向操作</div><div class="notion-text notion-block-146c030b24dd80a28e61d4e1e1489bf1">初始：每个GPU上有不同的数据</div><div class="notion-text notion-block-146c030b24dd801c82c7dc0cb8a53579">GPU0将GPU0, GPU1, GPU2, GPU3.的数据收回</div><div class="notion-text notion-block-146c030b24dd80398c3cfbaa2b9b815c">最终：GPU0有完整的数据</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd80f6921af5889d5722a3" 
data-id="146c030b24dd80f6921af5889d5722a3"><span><div id="146c030b24dd80f6921af5889d5722a3" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80f6921af5889d5722a3" title="All-Gather"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">All-Gather</span></span></h4><figure class="notion-asset-wrapper notion-asset-wrapper-image notion-block-146c030b24dd80c5b784f253eb5e5ef4"><div style="position:relative;display:flex;justify-content:center;align-self:center;width:650px;max-width:100%;flex-direction:column"><img style="object-fit:cover" src="https://www.notion.so/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2F7599b067-d45c-4f81-9480-91921f5c996e%2Ffad7c0c5-a8b6-42d6-aac9-4c6fe489d2c1%2Fimage.png?table=block&amp;id=146c030b-24dd-80c5-b784-f253eb5e5ef4&amp;t=146c030b-24dd-80c5-b784-f253eb5e5ef4&amp;width=650&amp;cache=v2" alt="notion image" loading="lazy" decoding="async"/></div></figure><div class="notion-text notion-block-146c030b24dd80c39993db32fe3392a5">Gather + Broadcast</div><div class="notion-text notion-block-146c030b24dd80c7881ce83305fc08d0">初始：每个GPU上有不同的数据</div><div class="notion-text notion-block-146c030b24dd80b1ab6bc3db7308e1c0">最终：每个GPU有完整的数据</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd80d9b101e21f13dc0b99" data-id="146c030b24dd80d9b101e21f13dc0b99"><span><div id="146c030b24dd80d9b101e21f13dc0b99" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80d9b101e21f13dc0b99" title="Reduce"><svg viewBox="0 0 16 16" width="16" 
height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">Reduce</span></span></h4><figure class="notion-asset-wrapper notion-asset-wrapper-image notion-block-146c030b24dd80bd8219fc296bbefb18"><div style="position:relative;display:flex;justify-content:center;align-self:center;width:650px;max-width:100%;flex-direction:column"><img style="object-fit:cover" src="https://www.notion.so/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2F7599b067-d45c-4f81-9480-91921f5c996e%2F095dc29e-f34f-47b5-b896-cd85dc646f6d%2Fimage.png?table=block&amp;id=146c030b-24dd-80bd-8219-fc296bbefb18&amp;t=146c030b-24dd-80bd-8219-fc296bbefb18&amp;width=650&amp;cache=v2" alt="notion image" loading="lazy" decoding="async"/></div></figure><div class="notion-text notion-block-146c030b24dd8085b792deb1f33e2035">初始：每个GPU上有不同的数据</div><div class="notion-text notion-block-146c030b24dd809c817ff8bfd9f1fc1c">将所有GPU的数据进行规约（求和，求积，矩阵运算等不改变维度）计算，将结果归于主节点GPU0（或其他GPU）</div><div class="notion-text notion-block-146c030b24dd800d84f2d51359a16e3e">最终：GPU0（或其他单个GPU）有规约后的结果</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd804ebae1d9cc0e24b6f3" data-id="146c030b24dd804ebae1d9cc0e24b6f3"><span><div id="146c030b24dd804ebae1d9cc0e24b6f3" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd804ebae1d9cc0e24b6f3" title="All-Reduce"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 
1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">All-Reduce</span></span></h4><figure class="notion-asset-wrapper notion-asset-wrapper-image notion-block-146c030b24dd80668e4dd4bf8edcbf37"><div style="position:relative;display:flex;justify-content:center;align-self:center;width:650px;max-width:100%;flex-direction:column"><img style="object-fit:cover" src="https://www.notion.so/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2F7599b067-d45c-4f81-9480-91921f5c996e%2Fda1b1940-ab5e-4d5d-bb54-19557f673b73%2Fimage.png?table=block&amp;id=146c030b-24dd-8066-8e4d-d4bf8edcbf37&amp;t=146c030b-24dd-8066-8e4d-d4bf8edcbf37&amp;width=650&amp;cache=v2" alt="notion image" loading="lazy" decoding="async"/></div></figure><div class="notion-text notion-block-146c030b24dd80a28b8dcc97d7fc5d32">Reduce + Broadcast</div><div class="notion-text notion-block-146c030b24dd802c8effc6b6c3536cf5">或者ReduceScatter + AllGather</div><div class="notion-text notion-block-146c030b24dd809d9c16fd3dd4e2e534">初始：每个GPU上有不同的数据</div><div class="notion-text notion-block-146c030b24dd8070a6e5cfaef43dddf4">最终：每个GPU有规约后的结果</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd80e8b3d5c63e75344b01" data-id="146c030b24dd80e8b3d5c63e75344b01"><span><div id="146c030b24dd80e8b3d5c63e75344b01" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80e8b3d5c63e75344b01" title="Reduce-Scatter"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 
00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">Reduce-Scatter</span></span></h4><figure class="notion-asset-wrapper notion-asset-wrapper-image notion-block-146c030b24dd80e1bb98c949db3b029f"><div style="position:relative;display:flex;justify-content:center;align-self:center;width:650px;max-width:100%;flex-direction:column"><img style="object-fit:cover" src="https://www.notion.so/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2F7599b067-d45c-4f81-9480-91921f5c996e%2F7c3b87d1-c1e0-4705-af08-eb822677eff6%2Fimage.png?table=block&amp;id=146c030b-24dd-80e1-bb98-c949db3b029f&amp;t=146c030b-24dd-80e1-bb98-c949db3b029f&amp;width=650&amp;cache=v2" alt="notion image" loading="lazy" decoding="async"/></div></figure><div class="notion-text notion-block-146c030b24dd80df9c74f934849e5080">初始：每个GPU有不同的数据</div><div class="notion-text notion-block-146c030b24dd803eb7abde3f6c5d3d02">中间步骤：每个GPU上的数据分维度进行规约操作，然后不同维度Scatter分发到不同GPU上</div><div class="notion-text notion-block-146c030b24dd80a4b323dbcd720158f8">最终：每个GPU上有不同维度的规约之后的操作</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd80e5ada9e47740ecc265" data-id="146c030b24dd80e5ada9e47740ecc265"><span><div id="146c030b24dd80e5ada9e47740ecc265" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80e5ada9e47740ecc265" title="All-to-All"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">All-to-All</span></span></h4><div class="notion-text notion-block-146c030b24dd804795acd4ca58bd52c5">Gather-Scatter</div><div 
class="notion-text notion-block-146c030b24dd801ca10fe9890c170f76">初始：每个GPU上有不同的数据</div><div class="notion-text notion-block-146c030b24dd80fe8f84c19be1789c53">中间步骤：每个GPU上的数据分维度进行Gather操作，然后将不同维度Gather在一起的数据Scatter分发到不同的GPU上</div><div class="notion-text notion-block-146c030b24dd80918a68dad881cec13b">最终：每个GPU上有不同维度合并的数据（注意：不是规约中的SUM）</div><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-146c030b24dd8074a595f8c6b751f67b" data-id="146c030b24dd8074a595f8c6b751f67b"><span><div id="146c030b24dd8074a595f8c6b751f67b" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd8074a595f8c6b751f67b" title="原语实现细节"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">原语实现细节</span></span></h3><div class="notion-text notion-block-146c030b24dd80929e5ff231b9dc4a50">NCCL通常使用一个或多个环形拓扑，来实现上述通信原语，能够得到最大化的带宽利用率。在nccl-test工具中，默认拓扑就是Ring。</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd8068839aee884411ede3" data-id="146c030b24dd8068839aee884411ede3"><span><div id="146c030b24dd8068839aee884411ede3" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd8068839aee884411ede3" title="Broadcast"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 
00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">Broadcast</span></span></h4><div class="notion-text notion-block-146c030b24dd80449dccd72b465142fd">数据传输流程：GPU0→GPU1→GPU2→GPU3</div><div class="notion-text notion-block-146c030b24dd80eba9c2c61920da3ec9">优化：分块传输可提高最终速度</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd80ba962dd3fa4c1de12b" data-id="146c030b24dd80ba962dd3fa4c1de12b"><span><div id="146c030b24dd80ba962dd3fa4c1de12b" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80ba962dd3fa4c1de12b" title="All-Reduce"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">All-Reduce</span></span></h4><div class="notion-text notion-block-146c030b24dd802eb374c31a58d54296">初始：每个GPU都有不同的数据（维度相同，数据不同）</div><div class="notion-text notion-block-146c030b24dd804f9243e8c4d1393e1b">步骤一：多个step进行，每个step都有两个GPU之间进行一个维度的Reduce操作</div><div class="notion-text notion-block-146c030b24dd80c397c0c3f6a0251eab">中间结果：步骤一多个step之后，每个GPU都有一个维度的数据是Reduce之后。</div><div class="notion-text notion-block-146c030b24dd80c1aa6ce96217089aac">步骤二：每个GPU将Reduce之后的维度的数据Broadcast到其他GPU上</div><div class="notion-text notion-block-146c030b24dd80f5b5f0f40460122e4d">最终：每个GPU都有不同维度Reduce之后的结果（完全相同）</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-146c030b24dd801f9b33e67099aa8c9c" data-id="146c030b24dd801f9b33e67099aa8c9c"><span><div id="146c030b24dd801f9b33e67099aa8c9c" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd801f9b33e67099aa8c9c" 
title="All-Gather"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">All-Gather</span></span></h4><div class="notion-text notion-block-146c030b24dd808ca0b1d7de6b3cb8c1">初始：每个GPU都有不同的数据</div><div class="notion-text notion-block-146c030b24dd805890c8f099616eced2">中间步骤：多个step，每个step都有GPU中间数据的transfer</div><div class="notion-text notion-block-146c030b24dd80fe8c8fc3eff935ba46">最终：每个GPU都有完整的数据</div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-145c030b24dd8043b168cbaa31ce4286" data-id="145c030b24dd8043b168cbaa31ce4286"><span><div id="145c030b24dd8043b168cbaa31ce4286" class="notion-header-anchor"></div><a class="notion-hash-link" href="#145c030b24dd8043b168cbaa31ce4286" title="并行方式"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">并行方式</span></span></h2><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-145c030b24dd80d78e55f38a596d6574" data-id="145c030b24dd80d78e55f38a596d6574"><span><div id="145c030b24dd80d78e55f38a596d6574" class="notion-header-anchor"></div><a class="notion-hash-link" href="#145c030b24dd80d78e55f38a596d6574" title="数据并行（DP）"><svg viewBox="0 0 16 16" width="16" height="16"><path 
fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">数据并行（DP）</span></span></h3><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-145c030b24dd80df88c8eade0a8b1e75" data-id="145c030b24dd80df88c8eade0a8b1e75"><span><div id="145c030b24dd80df88c8eade0a8b1e75" class="notion-header-anchor"></div><a class="notion-hash-link" href="#145c030b24dd80df88c8eade0a8b1e75" title="训练场景"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">训练场景</span></span></h4><div class="notion-text notion-block-145c030b24dd807fbe2ac653c545587a">每个 GPU 都运行相同的模型代码，而数据集被拆分为多份分配给不同的 GPU 进行训练。每轮迭代完成后，需要通过 <b>AllReduce</b> 操作进行同步。
AllReduce = ReduceScatter + AllGather</div><div class="notion-text notion-block-146c030b24dd80528b1dd8a455505cab">或者AllReduce = Reduce + Broadcast</div><div class="notion-text notion-block-145c030b24dd8095b8cbd89b98092a19">初始：模型参数分散在各个GPU上</div><div class="notion-text notion-block-146c030b24dd80b89262eef6a3224011">步骤一：All-Gather操作让每个GPU上有完整的模型参数</div><div class="notion-text notion-block-146c030b24dd803e983af12a662cc547">步骤二：每个GPU（模型）上针对不同数据进行Forward Pass</div><div class="notion-text notion-block-146c030b24dd806a90d1d359ceeb30de">步骤三：每个GPU（模型）针对本地Backward Pass进行更新</div><div class="notion-text notion-block-146c030b24dd80498876fe061b7a09c6">步骤四：针对所有GPU做All-Reduce操作，进行梯度累计更新</div><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-145c030b24dd80409579e264692b2f27" data-id="145c030b24dd80409579e264692b2f27"><span><div id="145c030b24dd80409579e264692b2f27" class="notion-header-anchor"></div><a class="notion-hash-link" href="#145c030b24dd80409579e264692b2f27" title="张量并行（TP）"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">张量并行（TP）</span></span></h3><div class="notion-text notion-block-146c030b24dd80ecb5fcd149d2d384f1">利用多头注意力机制进行张量并行操作，每个头计算独立分配到不同的GPU上，通过All-Reduce操作同步矩阵乘法结果。</div><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-146c030b24dd801bb669f49a21ee26e7" data-id="146c030b24dd801bb669f49a21ee26e7"><span><div id="146c030b24dd801bb669f49a21ee26e7" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd801bb669f49a21ee26e7" title="流水线并行（PP）"><svg viewBox="0 0 16 16" width="16" height="16"><path 
fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">流水线并行（PP）</span></span></h3><div class="notion-text notion-block-146c030b24dd8085bfa9d60261b5f81b">将模型的多个层（stage）分在不同的GPU上，通过点对点通信</div><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-146c030b24dd80d9b80ff99d8969465b" data-id="146c030b24dd80d9b80ff99d8969465b"><span><div id="146c030b24dd80d9b80ff99d8969465b" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd80d9b80ff99d8969465b" title="序列并行（SP）"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">序列并行（SP）</span></span></h3><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-146c030b24dd8032b69fcf8272c147ad" data-id="146c030b24dd8032b69fcf8272c147ad"><span><div id="146c030b24dd8032b69fcf8272c147ad" class="notion-header-anchor"></div><a class="notion-hash-link" href="#146c030b24dd8032b69fcf8272c147ad" title="专家并行（）"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 
3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">专家并行（）</span></span></h3><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-145c030b24dd80ce866acd9c7a7fe43b" data-id="145c030b24dd80ce866acd9c7a7fe43b"><span><div id="145c030b24dd80ce866acd9c7a7fe43b" class="notion-header-anchor"></div><a class="notion-hash-link" href="#145c030b24dd80ce866acd9c7a7fe43b" title="参考文章"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">参考文章</span></span></h2><div class="notion-row"><a target="_blank" rel="noopener noreferrer" class="notion-bookmark notion-block-145c030b24dd8088b377f7458514b67b" href="https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html"><div><div class="notion-bookmark-title">Collective Operations — NCCL 2.23.4 documentation</div><div class="notion-bookmark-description">Collective operations have to be called for each rank (hence CUDA device), using the same count and the same datatype, to form a complete collective operation.
Failure to do so will result in undefined behavior, including hangs, crashes, or data corruption.</div><div class="notion-bookmark-link"><div class="notion-bookmark-link-text">https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/collectives.html</div></div></div></a></div><div class="notion-row"><a target="_blank" rel="noopener noreferrer" class="notion-bookmark notion-block-145c030b24dd806c95fae579eddb4a97" href="https://aijishu.com/a/1060000000483892"><div><div class="notion-bookmark-title">一文讲清 NCCL 集合通信原理与优化 - 极术社区 - 连接开发者与智能计算生态</div><div class="notion-bookmark-description">大模型分布式训练往往需要上千乃至上万 GPU 卡进行超大规模并行训练，是典型的计算密集型和通信密集型场景。</div><div class="notion-bookmark-link"><div class="notion-bookmark-link-icon"><img src="https://www.notion.so/image/https%3A%2F%2Fcdn-assets.aijishu.com%2Fv-8647875a%2Fpublic%2Ffavicons%2Fapple-touch-icon.png?table=block&amp;id=145c030b-24dd-806c-95fa-e579eddb4a97&amp;t=145c030b-24dd-806c-95fa-e579eddb4a97" alt="一文讲清 NCCL 集合通信原理与优化 - 极术社区 - 连接开发者与智能计算生态" loading="lazy" decoding="async"/></div><div class="notion-bookmark-link-text">https://aijishu.com/a/1060000000483892</div></div></div></a></div><h4 class="notion-h notion-h3 notion-h-indent-1 notion-block-22cc030b24dd81f5b3b3fd63152c82e7" data-id="22cc030b24dd81f5b3b3fd63152c82e7"><span><div id="22cc030b24dd81f5b3b3fd63152c82e7" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd81f5b3b3fd63152c82e7" title="专家并行 (Mixture of Experts, MoE)"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">专家并行 (Mixture of Experts, 
MoE)</span></span></h4><div class="notion-text notion-block-22cc030b24dd8191a1afef905ca5ae64">MoE是一种模型架构，它将模型分为多个“专家”网络，每个专家专注于处理特定类型的输入。一个门控网络（Gating Network）会根据输入动态地决定激活哪个或哪些专家。这种方式允许模型规模变得非常大，但每次前向传播时只使用其中的一小部分计算资源，从而在不显著增加计算成本的情况下提升模型容量和性能。</div></main></div>]]></content:encoded>
        </item>
        <item>
            <title><![CDATA[AI]]></title>
            <link>https://notion.zhaozhan.site/article/14ec030b-24dd-800c-ac0b-e0c0d33f4ba2</link>
            <guid>https://notion.zhaozhan.site/article/14ec030b-24dd-800c-ac0b-e0c0d33f4ba2</guid>
            <pubDate>Sat, 30 Nov 2024 00:00:00 GMT</pubDate>
            <content:encoded><![CDATA[<div id="notion-article" class="mx-auto overflow-hidden "><main class="notion light-mode notion-page notion-block-14ec030b24dd800cac0be0c0d33f4ba2"><div class="notion-viewport"></div><a class="notion-page-link notion-block-14ec030b24dd80efab6ed778158c6ad2" href="/14ec030b24dd80efab6ed778158c6ad2"><span class="notion-page-title"><div class="notion-page-icon-inline notion-page-icon-image"><svg class="notion-page-title-icon notion-page-icon" alt="Agent" viewBox="0 0 30 30" width="16"><path d="M16,1H4v28h22V11L16,1z M16,3.828L23.172,11H16V3.828z M24,27H6V3h8v10h10V27z M8,17h14v-2H8V17z M8,21h14v-2H8V21z M8,25h14v-2H8V25z"></path></svg></div><span class="notion-page-title-text">Agent</span></span></a><a class="notion-page-link notion-block-1b0c030b24dd804fb4b7c53fa4aecd4d" href="/1b0c030b24dd804fb4b7c53fa4aecd4d"><span class="notion-page-title"><div class="notion-page-icon-inline notion-page-icon-image"><svg class="notion-page-title-icon notion-page-icon" alt="Transformer" viewBox="0 0 30 30" width="16"><path d="M16,1H4v28h22V11L16,1z M16,3.828L23.172,11H16V3.828z M24,27H6V3h8v10h10V27z M8,17h14v-2H8V17z M8,21h14v-2H8V21z M8,25h14v-2H8V25z"></path></svg></div><span class="notion-page-title-text">Transformer</span></span></a><a class="notion-page-link notion-block-22cc030b24dd80138248e60b11f5a534" href="/22cc030b24dd80138248e60b11f5a534"><span class="notion-page-title"><div class="notion-page-icon-inline notion-page-icon-image"><svg class="notion-page-title-icon notion-page-icon" alt="RAG" viewBox="0 0 30 30" width="16"><path d="M16,1H4v28h22V11L16,1z M16,3.828L23.172,11H16V3.828z M24,27H6V3h8v10h10V27z M8,17h14v-2H8V17z M8,21h14v-2H8V21z M8,25h14v-2H8V25z"></path></svg></div><span class="notion-page-title-text">RAG</span></span></a><a class="notion-page-link notion-block-22cc030b24dd8050a1f1ee8a9069f142" href="/22cc030b24dd8050a1f1ee8a9069f142"><span class="notion-page-title"><div class="notion-page-icon-inline 
notion-page-icon-image"><svg class="notion-page-title-icon notion-page-icon" alt="Deep Learning" viewBox="0 0 30 30" width="16"><path d="M16,1H4v28h22V11L16,1z M16,3.828L23.172,11H16V3.828z M24,27H6V3h8v10h10V27z M8,17h14v-2H8V17z M8,21h14v-2H8V21z M8,25h14v-2H8V25z"></path></svg></div><span class="notion-page-title-text">Deep Learning</span></span></a></main></div>]]></content:encoded>
        </item>
        <item>
            <title><![CDATA[NCCL]]></title>
            <link>https://notion.zhaozhan.site/article/151c030b-24dd-80fd-9402-d535f121f0b6</link>
            <guid>https://notion.zhaozhan.site/article/151c030b-24dd-80fd-9402-d535f121f0b6</guid>
            <pubDate>Tue, 03 Dec 2024 00:00:00 GMT</pubDate>
            <content:encoded><![CDATA[<div id="notion-article" class="mx-auto overflow-hidden "><main class="notion light-mode notion-page notion-block-151c030b24dd80fd9402d535f121f0b6"><div class="notion-viewport"></div><a class="notion-page-link notion-block-145c030b24dd8028a995ea92c4c5f7b1" href="/145c030b24dd8028a995ea92c4c5f7b1"><span class="notion-page-title"><div class="notion-page-icon-inline notion-page-icon-image"><svg class="notion-page-title-icon notion-page-icon" alt="NCCL通信" viewBox="0 0 30 30" width="16"><path d="M16,1H4v28h22V11L16,1z M16,3.828L23.172,11H16V3.828z M24,27H6V3h8v10h10V27z M8,17h14v-2H8V17z M8,21h14v-2H8V21z M8,25h14v-2H8V25z"></path></svg></div><span class="notion-page-title-text">NCCL通信</span></span></a><a class="notion-page-link notion-block-151c030b24dd80a79e19dbcb1ccd11c8" href="/151c030b24dd80a79e19dbcb1ccd11c8"><span class="notion-page-title"><div class="notion-page-icon-inline notion-page-icon-image"><svg class="notion-page-title-icon notion-page-icon" alt="nccl-test" viewBox="0 0 30 30" width="16"><path d="M16,1H4v28h22V11L16,1z M16,3.828L23.172,11H16V3.828z M24,27H6V3h8v10h10V27z M8,17h14v-2H8V17z M8,21h14v-2H8V21z M8,25h14v-2H8V25z"></path></svg></div><span class="notion-page-title-text">nccl-test</span></span></a><div class="notion-blank notion-block-22cc030b24dd804ab976e80df9797422"> </div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-22cc030b24dd815683dcf6f17d6a608e" data-id="22cc030b24dd815683dcf6f17d6a608e"><span><div id="22cc030b24dd815683dcf6f17d6a608e" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd815683dcf6f17d6a608e" title="NVIDIA Collective Communications Library (NCCL)"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 
.75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">NVIDIA Collective Communications Library (NCCL)</span></span></h2><div class="notion-text notion-block-22cc030b24dd810c9bf7f0d037637465">NCCL 是 NVIDIA 开发的一个高性能库，旨在为多 GPU 和多节点环境中的深度学习框架提供优化的集合通信原语。它通过利用 NVIDIA GPU 的高速互联技术（如 NVLink）和高效的通信算法，显著加速了分布式训练的性能。</div><div class="notion-text notion-block-22cc030b24dd8152b1ddefe3b6e005ee">NCCL 提供了多种集合通信操作，包括：</div><ul class="notion-list notion-list-disc notion-block-22cc030b24dd81c4a3ecc4410b98bade"><li>All-Reduce: 所有参与者贡献数据，并接收所有数据的规约结果。</li></ul><ul class="notion-list notion-list-disc notion-block-22cc030b24dd8130b9bdfd6f86655ff9"><li>All-Gather: 所有参与者贡献数据，并接收所有数据的完整集合。</li></ul><ul class="notion-list notion-list-disc notion-block-22cc030b24dd816fbaaef79c26bc9ed1"><li>Reduce-Scatter: 所有参与者贡献数据，并接收部分规约结果。</li></ul><ul class="notion-list notion-list-disc notion-block-22cc030b24dd815ebfedcea7b7bf465e"><li>Broadcast: 一个根节点将数据发送给所有其他参与者。</li></ul><div class="notion-text notion-block-22cc030b24dd81f6b11eda05636e25a3">这些操作是分布式深度学习训练中梯度同步和数据分发的核心。</div></main></div>]]></content:encoded>
        </item>
        <item>
            <title><![CDATA[nccl-test]]></title>
            <link>https://notion.zhaozhan.site/article/151c030b-24dd-80a7-9e19-dbcb1ccd11c8</link>
            <guid>https://notion.zhaozhan.site/article/151c030b-24dd-80a7-9e19-dbcb1ccd11c8</guid>
            <pubDate>Tue, 03 Dec 2024 00:00:00 GMT</pubDate>
            <content:encoded><![CDATA[<div id="notion-article" class="mx-auto overflow-hidden "><main class="notion light-mode notion-page notion-block-151c030b24dd80a79e19dbcb1ccd11c8"><div class="notion-viewport"></div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-151c030b24dd80f48e16e75774b31e3a" data-id="151c030b24dd80f48e16e75774b31e3a"><span><div id="151c030b24dd80f48e16e75774b31e3a" class="notion-header-anchor"></div><a class="notion-hash-link" href="#151c030b24dd80f48e16e75774b31e3a" title="参考文章"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">参考文章</span></span></h2><div class="notion-row"><a target="_blank" rel="noopener noreferrer" class="notion-bookmark notion-block-151c030b24dd8040b3a9ed0e2c006150" 
href="https://zhuanlan.zhihu.com/p/682530828#:~:text=NCCL%E6%B5%8B%E8%AF%95%E4%BE%9D%E8%B5%96%E4%BA%8EMPI%E4%BB%A5%E5%9C%A8%E5%A4%9A%E4%B8%AA%E8%BF%9B%E7%A8%8B%E5%92%8C%E5%A4%9A%E4%B8%AA%E8%8A%82%E7%82%B9%E4%B8%8A%E5%B7%A5%E4%BD%9C%E3%80%82%20%E5%A6%82%E6%9E%9C%E4%BD%A0%E6%83%B3%E4%BD%BF%E7%94%A8MPI%E6%94%AF%E6%8C%81%E7%BC%96%E8%AF%91%E8%BF%99%E4%BA%9B%E6%B5%8B%E8%AF%95%EF%BC%8C%E9%9C%80%E8%A6%81%E5%B0%86%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8FMPI%E8%AE%BE%E7%BD%AE%E4%B8%BA1%EF%BC%8C%E5%B9%B6%E5%B0%86MPI_HOME%E8%AE%BE%E7%BD%AE%E4%B8%BAMPI%E5%AE%89%E8%A3%85%E7%9A%84%E8%B7%AF%E5%BE%84%E3%80%82%20cd%20nccl-tests,%23%20%E7%BC%96%E8%AF%91%E6%94%AF%E6%8C%81mpi%E7%9A%84test%20%E6%88%90%E5%8A%9F%E5%90%8E%E4%BC%9A%E5%9C%A8build%E7%9B%AE%E5%BD%95%E4%B8%8B%E7%94%9F%E6%88%90%E5%8F%AF%E6%89%A7%E8%A1%8C%E6%96%87%E4%BB%B6%20NCCL%E6%B5%8B%E8%AF%95%E5%8F%AF%E4%BB%A5%E5%9C%A8%E5%A4%9A%E4%B8%AA%E8%BF%9B%E7%A8%8B%E3%80%81%E5%A4%9A%E4%B8%AA%E7%BA%BF%E7%A8%8B%E5%92%8C%E6%AF%8F%E4%B8%AA%E7%BA%BF%E7%A8%8B%E4%B8%8A%E7%9A%84%E5%A4%9A%E4%B8%AACUDA%E8%AE%BE%E5%A4%87%E4%B8%8A%E8%BF%90%E8%A1%8C%E3%80%82"><div><div class="notion-bookmark-title">zhuanlan.zhihu.com</div><div class="notion-bookmark-link"><div 
class="notion-bookmark-link-text">https://zhuanlan.zhihu.com/p/682530828#:~:text=NCCL%E6%B5%8B%E8%AF%95%E4%BE%9D%E8%B5%96%E4%BA%8EMPI%E4%BB%A5%E5%9C%A8%E5%A4%9A%E4%B8%AA%E8%BF%9B%E7%A8%8B%E5%92%8C%E5%A4%9A%E4%B8%AA%E8%8A%82%E7%82%B9%E4%B8%8A%E5%B7%A5%E4%BD%9C%E3%80%82%20%E5%A6%82%E6%9E%9C%E4%BD%A0%E6%83%B3%E4%BD%BF%E7%94%A8MPI%E6%94%AF%E6%8C%81%E7%BC%96%E8%AF%91%E8%BF%99%E4%BA%9B%E6%B5%8B%E8%AF%95%EF%BC%8C%E9%9C%80%E8%A6%81%E5%B0%86%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8FMPI%E8%AE%BE%E7%BD%AE%E4%B8%BA1%EF%BC%8C%E5%B9%B6%E5%B0%86MPI_HOME%E8%AE%BE%E7%BD%AE%E4%B8%BAMPI%E5%AE%89%E8%A3%85%E7%9A%84%E8%B7%AF%E5%BE%84%E3%80%82%20cd%20nccl-tests,%23%20%E7%BC%96%E8%AF%91%E6%94%AF%E6%8C%81mpi%E7%9A%84test%20%E6%88%90%E5%8A%9F%E5%90%8E%E4%BC%9A%E5%9C%A8build%E7%9B%AE%E5%BD%95%E4%B8%8B%E7%94%9F%E6%88%90%E5%8F%AF%E6%89%A7%E8%A1%8C%E6%96%87%E4%BB%B6%20NCCL%E6%B5%8B%E8%AF%95%E5%8F%AF%E4%BB%A5%E5%9C%A8%E5%A4%9A%E4%B8%AA%E8%BF%9B%E7%A8%8B%E3%80%81%E5%A4%9A%E4%B8%AA%E7%BA%BF%E7%A8%8B%E5%92%8C%E6%AF%8F%E4%B8%AA%E7%BA%BF%E7%A8%8B%E4%B8%8A%E7%9A%84%E5%A4%9A%E4%B8%AACUDA%E8%AE%BE%E5%A4%87%E4%B8%8A%E8%BF%90%E8%A1%8C%E3%80%82</div></div></div></a></div><div class="notion-blank notion-block-151c030b24dd8006bf87ec86c0331a1e"> </div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-22cc030b24dd81a6ab2aedd3d4f1257f" data-id="22cc030b24dd81a6ab2aedd3d4f1257f"><span><div id="22cc030b24dd81a6ab2aedd3d4f1257f" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd81a6ab2aedd3d4f1257f" title="nccl-test 工具介绍"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span 
class="notion-h-title">nccl-test 工具介绍</span></span></h2><div class="notion-text notion-block-22cc030b24dd8138bbd7e800b14e8be3">nccl-tests 是 NVIDIA 官方提供的一套用于测试和基准化 NCCL 性能的工具集。它包含了针对各种集合通信原语（如 All-Reduce, All-Gather, Broadcast 等）的测试程序，可以帮助用户评估多 GPU 或多节点环境下的通信带宽和延迟。</div><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-22cc030b24dd811a9b30de14b69e4277" data-id="22cc030b24dd811a9b30de14b69e4277"><span><div id="22cc030b24dd811a9b30de14b69e4277" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd811a9b30de14b69e4277" title="基本使用示例"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">基本使用示例</span></span></h3><div class="notion-text notion-block-22cc030b24dd810a8b23cc52e209976c">编译 nccl-tests (通常在 NCCL 源码目录下):</div><div class="notion-text notion-block-22cc030b24dd816f9b98c630ec1fe132">运行 All-Reduce 测试 (例如，测试 2个GPU，数据大小为 8M):</div><div class="notion-text notion-block-22cc030b24dd816db74afb01a25665ff">运行 All-Gather 测试 (例如，测试 4个GPU，数据大小为 16M):</div><div class="notion-text notion-block-22cc030b24dd8117ad14d2fe52062b46">更多选项和测试类型，请参考 nccl-tests 的官方文档或运行程序时使用 --help 参数。</div></main></div>]]></content:encoded>
        </item>
        <item>
            <title><![CDATA[GPU通信方式]]></title>
            <link>https://notion.zhaozhan.site/article/152c030b-24dd-80e3-bdcb-e76d31dd38c3</link>
            <guid>https://notion.zhaozhan.site/article/152c030b-24dd-80e3-bdcb-e76d31dd38c3</guid>
            <pubDate>Wed, 04 Dec 2024 00:00:00 GMT</pubDate>
            <content:encoded><![CDATA[<div id="notion-article" class="mx-auto overflow-hidden "><main class="notion light-mode notion-page notion-block-152c030b24dd80e3bdcbe76d31dd38c3"><div class="notion-viewport"></div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-152c030b24dd80f09c1bfff3aa921cbb" data-id="152c030b24dd80f09c1bfff3aa921cbb"><span><div id="152c030b24dd80f09c1bfff3aa921cbb" class="notion-header-anchor"></div><a class="notion-hash-link" href="#152c030b24dd80f09c1bfff3aa921cbb" title="单机多卡"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">单机多卡</span></span></h2><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-152c030b24dd80ac86a3f9a1ac4711a9" data-id="152c030b24dd80ac86a3f9a1ac4711a9"><span><div id="152c030b24dd80ac86a3f9a1ac4711a9" class="notion-header-anchor"></div><a class="notion-hash-link" href="#152c030b24dd80ac86a3f9a1ac4711a9" title="GPU Direct"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">GPU Direct</span></span></h3><div class="notion-text notion-block-152c030b24dd8010810acff46b271076">GPU Direct 是 NVIDIA 开发的一项技术，可实现 GPU 与其他设备（例如网络接口卡 (NIC) 和存储设备）之间的直接通信和数据传输，而不涉及 
CPU。</div><h4 class="notion-h notion-h3 notion-h-indent-2 notion-block-152c030b24dd800f9045c6bd9a220dc8" data-id="152c030b24dd800f9045c6bd9a220dc8"><span><div id="152c030b24dd800f9045c6bd9a220dc8" class="notion-header-anchor"></div><a class="notion-hash-link" href="#152c030b24dd800f9045c6bd9a220dc8" title="GPUDirect Storage"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">GPUDirect Storage</span></span></h4><div class="notion-text notion-block-152c030b24dd806280c4c245ed6a42de">GPUDirect Storage 允许存储设备和 GPU 之间进行直接数据传输，绕过 CPU，减少数据传输的延迟和 CPU 开销。</div><div class="notion-text notion-block-152c030b24dd80f09e9dcabba004c4cb">通过 GPUDirect Storage，GPU 可以直接从存储设备（如固态硬盘（SSD）或非易失性内存扩展（NVMe）驱动器）访问数据，而无需将数据先复制到 CPU 的内存中。这种直接访问能够实现更快的数据传输速度，并更高效地利用 GPU 资源。</div><blockquote class="notion-quote notion-block-152c030b24dd80afbd8aee4f4de61cab"><div>NVMe 全称 Non-Volatile Memory Express，中文译为非易失性内存主机控制器接口规范。它是一种专为闪存和下一代固态硬盘（SSD）设计的高性能存储协议。NVMe 允许 SSD 直接通过 PCIe 总线与 CPU 通信，绕过了传统 SATA 接口的瓶颈，大幅提升数据读写速度。</div></blockquote><blockquote class="notion-quote notion-block-152c030b24dd80fdbff8da7c09def08f"><div>闪存（Flash Memory）是一种非易失性（Non-Volatile）的计算机存储芯片, 闪存主要分为两种类型：
• <b>NAND Flash（与非闪存）：</b> 容量大、成本低、擦写速度快，但可靠性相对较低，主要用于大容量存储设备，如固态硬盘（SSD）、U盘、存储卡等。
• <b>NOR Flash（或非闪存）：</b> 容量较小、成本较高、擦写速度较慢，但可靠性高，可以直接执行代码（XIP, Execute In Place），主要用于存储启动代码、固件等，常见于嵌入式系统、手机等设备。</div></blockquote><blockquote class="notion-quote notion-block-152c030b24dd80f0b002df8f659ef3c3"><div>SATA，全称 Serial ATA（Serial Advanced Technology Attachment），中文译为串行高级技术附件，是一种计算机总线接口，主要用于连接主机系统（如计算机主板）与存储设备（如硬盘、光驱）。</div></blockquote><figure class="notion-asset-wrapper notion-asset-wrapper-image notion-block-152c030b24dd80d1b493ef1673d64c1f"><div style="position:relative;display:flex;justify-content:center;align-self:center;width:100%;max-width:100%;flex-direction:column;height:100%"><img style="object-fit:cover" src="https://www.notion.so/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2F7599b067-d45c-4f81-9480-91921f5c996e%2Fb1dac499-7ea1-4cb4-b95d-1131fa0bbd6f%2Fimage.png?table=block&amp;id=152c030b-24dd-80d1-b493-ef1673d64c1f&amp;t=152c030b-24dd-80d1-b493-ef1673d64c1f&amp;width=707.9896240234375&amp;cache=v2" alt="notion image" loading="lazy" decoding="async"/></div></figure><figure class="notion-asset-wrapper notion-asset-wrapper-image notion-block-152c030b24dd804d990ded1983614a73"><div style="position:relative;display:flex;justify-content:center;align-self:center;width:100%;max-width:100%;flex-direction:column;height:100%"><img style="object-fit:cover" src="https://www.notion.so/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2F7599b067-d45c-4f81-9480-91921f5c996e%2F7f0b6e0a-510e-41f9-9d42-0d7f5febe3b5%2Fimage.png?table=block&amp;id=152c030b-24dd-804d-990d-ed1983614a73&amp;t=152c030b-24dd-804d-990d-ed1983614a73&amp;width=707.9791870117188&amp;cache=v2" alt="notion image" loading="lazy" decoding="async"/></div></figure><div class="notion-text notion-block-152c030b24dd8000925dee54bfee4470">##</div><ul class="notion-list notion-list-disc notion-block-152c030b24dd80aca36fd74881eedc7c"><li>GPUDirect RDMA</li></ul><ul class="notion-list notion-list-disc notion-block-152c030b24dd80a880b1e85bd5d70b77"><li>GPUDirect 
P2P</li></ul><ul class="notion-list notion-list-disc notion-block-152c030b24dd8041b337c841b07a6f74"><li>GPUDirect 视频</li></ul><div class="notion-blank notion-block-152c030b24dd8052b443cc2a08b26d78"> </div><div class="notion-blank notion-block-152c030b24dd8021b967d1566871b4ce"> </div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-152c030b24dd80139f36e8057dcd338a" data-id="152c030b24dd80139f36e8057dcd338a"><span><div id="152c030b24dd80139f36e8057dcd338a" class="notion-header-anchor"></div><a class="notion-hash-link" href="#152c030b24dd80139f36e8057dcd338a" title="参考文章"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">参考文章</span></span></h2><div class="notion-row"><a target="_blank" rel="noopener noreferrer" class="notion-bookmark notion-block-152c030b24dd809baef2d3718401f55c" href="https://www.cnblogs.com/upyun/p/17679500.html"><div><div class="notion-bookmark-title">聊透 GPU 通信技术——GPU Direct、NVLink、RDMA - 又拍云 - 博客园</div><div class="notion-bookmark-description">最近人工智能大火，AI 应用所涉及的技术能力包括语音、图像、视频、NLP 等多方面，而这些都需要强大的计算资源支持。AI 技术对算力的需求是非常庞大的，虽然 GPU 的计算能力在持续提升，但是对于 AI 来说，单卡的计算能力就算再强，也是有极限的，这就需要多 GPU 组合。而 GPU 多卡的组合，主</div><div class="notion-bookmark-link"><div class="notion-bookmark-link-icon"><img src="https://www.notion.so/image/https%3A%2F%2Fassets.cnblogs.com%2Ffavicon_v3_2.ico?table=block&amp;id=152c030b-24dd-809b-aef2-d3718401f55c&amp;t=152c030b-24dd-809b-aef2-d3718401f55c" alt="聊透 GPU 通信技术——GPU Direct、NVLink、RDMA - 又拍云 - 博客园" loading="lazy" decoding="async"/></div><div 
class="notion-bookmark-link-text">https://www.cnblogs.com/upyun/p/17679500.html</div></div></div></a></div><div class="notion-blank notion-block-152c030b24dd80039f18fdfd04f6a7e7"> </div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-22cc030b24dd81538175f507b1a540a6" data-id="22cc030b24dd81538175f507b1a540a6"><span><div id="22cc030b24dd81538175f507b1a540a6" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd81538175f507b1a540a6" title="单机多卡通信"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">单机多卡通信</span></span></h2><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-22cc030b24dd8198977cc71bce5579db" data-id="22cc030b24dd8198977cc71bce5579db"><span><div id="22cc030b24dd8198977cc71bce5579db" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd8198977cc71bce5579db" title="GPUDirect P2P"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">GPUDirect P2P</span></span></h3><div class="notion-text notion-block-22cc030b24dd81b2b550dc148950ce85">GPUDirect P2P (Peer-to-Peer) 允许同一台服务器上的不同 GPU 之间直接进行数据传输，而无需通过 CPU 或系统内存。这显著减少了数据传输的延迟和 CPU 
的开销，是单机多卡通信中最常用的优化技术。</div><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-22cc030b24dd819ebbeccc0477459f7a" data-id="22cc030b24dd819ebbeccc0477459f7a"><span><div id="22cc030b24dd819ebbeccc0477459f7a" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd819ebbeccc0477459f7a" title="NVLink"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">NVLink</span></span></h3><div class="notion-text notion-block-22cc030b24dd81fc8306eac70b2b5801">NVLink 是 NVIDIA 推出的一种高速互联技术，用于 GPU 之间以及 GPU 与 CPU 之间的直接连接。它提供了比 PCIe 更高的带宽和更低的延迟，特别适用于多 GPU 系统中的数据密集型任务，如深度学习训练。</div><h2 class="notion-h notion-h1 notion-h-indent-0 notion-block-22cc030b24dd81628b55eac5500b73d1" data-id="22cc030b24dd81628b55eac5500b73d1"><span><div id="22cc030b24dd81628b55eac5500b73d1" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd81628b55eac5500b73d1" title="多机多卡通信"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">多机多卡通信</span></span></h2><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-22cc030b24dd81f7b0e4e4ccf55c9fb2" data-id="22cc030b24dd81f7b0e4e4ccf55c9fb2"><span><div 
id="22cc030b24dd81f7b0e4e4ccf55c9fb2" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd81f7b0e4e4ccf55c9fb2" title="GPUDirect RDMA"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">GPUDirect RDMA</span></span></h3><div class="notion-text notion-block-22cc030b24dd815ea8b1f256c21499bb">GPUDirect RDMA 允许 GPU 直接与支持 RDMA 的网络适配器（如 InfiniBand 或 RoCE 网卡）进行数据传输，而无需通过 CPU 或系统内存。这使得多机多卡系统中的 GPU 之间能够进行高速、低延迟的直接通信，是构建大规模分布式深度学习集群的关键技术。</div><h3 class="notion-h notion-h2 notion-h-indent-1 notion-block-22cc030b24dd81ca9d53f961dd50bd5b" data-id="22cc030b24dd81ca9d53f961dd50bd5b"><span><div id="22cc030b24dd81ca9d53f961dd50bd5b" class="notion-header-anchor"></div><a class="notion-hash-link" href="#22cc030b24dd81ca9d53f961dd50bd5b" title="GPUDirect Storage"><svg viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M7.775 3.275a.75.75 0 001.06 1.06l1.25-1.25a2 2 0 112.83 2.83l-2.5 2.5a2 2 0 01-2.83 0 .75.75 0 00-1.06 1.06 3.5 3.5 0 004.95 0l2.5-2.5a3.5 3.5 0 00-4.95-4.95l-1.25 1.25zm-4.69 9.64a2 2 0 010-2.83l2.5-2.5a2 2 0 012.83 0 .75.75 0 001.06-1.06 3.5 3.5 0 00-4.95 0l-2.5 2.5a3.5 3.5 0 004.95 4.95l1.25-1.25a.75.75 0 00-1.06-1.06l-1.25 1.25a2 2 0 01-2.83 0z"></path></svg></a><span class="notion-h-title">GPUDirect Storage</span></span></h3><div class="notion-text notion-block-22cc030b24dd81dbafddc8b07615d653">GPUDirect Storage 允许 GPU 直接从存储设备（如 NVMe SSD）读取数据，绕过 CPU 和系统内存。这对于需要处理大量数据的应用（如数据分析、AI训练）来说，可以显著提高数据加载速度和整体性能。</div></main></div>]]></content:encoded>
        </item>
    </channel>
</rss>