第5.3章：Surface Shader执行流程详解

理解Surface Shader的执行流程对于优化性能和调试问题至关重要。本章将详细解析Surface Shader在渲染管线中的执行过程，包括前向渲染和延迟渲染的差异，以及各个阶段的具体工作原理。

🎯 学习目标

通过本章学习，你将掌握：

- Surface Shader在渲染管线中的完整执行流程
- 前向渲染和延迟渲染的执行差异
- 光照计算的时机和方式
- 多Pass渲染的执行顺序
- 性能优化的关键点

🔄 渲染管线概览

Cocos Creator渲染管线架构

graph TD
    A[Scene Objects] --> B[Culling]
    B --> C[Sorting]
    C --> D[Batching]
    D --> E[Vertex Processing]
    E --> F[Rasterization]
    F --> G[Fragment Processing]
    G --> H[Output Merger]
    H --> I[Frame Buffer]

Surface Shader在管线中的位置

// Where the Surface Shader is invoked inside the render pipeline
// (illustrative pseudo-code, not engine source).
class RenderPipeline {
    executeRenderPass(objects: RenderObject[]) {
        // 1. Geometry stage: run the vertex shader for every object.
        for (let obj of objects) {
            this.executeVertexShader(obj);
        }

        // 2. Rasterization stage.
        this.rasterizeTriangles();

        // 3. Shading stage: run the fragment shader for every fragment.
        // NOTE(review): `fragments` is not declared in this snippet; presumably
        // it stands for the output of the rasterization stage above.
        for (let fragment of fragments) {
            this.executeFragmentShader(fragment);
        }

        // 4. Output merging.
        this.blendAndOutput();
    }
}

🏃‍♂️ 顶点处理阶段

顶点着色器执行流程

// Detailed walk-through of Surface Shader vertex processing.
CCProgram surface-vs %{
  precision highp float;
  
  // === Step 1: fetch the input data ===
  #include <builtin/inputs/cc-position>
  #include <builtin/inputs/cc-normal>
  #include <builtin/inputs/cc-texcoord>
  
  vec4 vert() {
    // Step 2: initialise the input struct.
    // The variable is called `In` because `input` is a reserved word in GLSL.
    StandardVertInput In;
    CCVertInput(In);  // read the data from the vertex buffer
    
    // Step 3: vertex transform (local -> world -> view -> clip).
    vec4 localPos = In.position;
    vec4 worldPos = cc_matWorld * localPos;
    vec4 viewPos = cc_matView * worldPos;
    vec4 clipPos = cc_matProj * viewPos;
    
    // Step 4: normal transform (w = 0 so translation is ignored).
    vec3 worldNormal = normalize((cc_matWorldIT * vec4(In.normal, 0)).xyz);
    
    // Step 5: hand the data over to the fragment shader.
    CC_TRANSFER_WORLDPOS(worldPos);
    CC_TRANSFER_WORLDNORMAL(worldNormal);
    CC_TRANSFER_UV(In.texCoord);
    
    // Step 6: return the clip-space position.
    return clipPos;
  }
}%

顶点变换详解

// Applies the transform matrices step by step.
// The parameter is called `In` because `input` is a reserved word in GLSL.
void performVertexTransforms(inout StandardVertInput In) {
    // 1. Local space -> world space.
    vec4 worldPosition = cc_matWorld * In.position;
    vec3 worldNormal = normalize((cc_matWorldIT * vec4(In.normal, 0)).xyz);
    
    // 2. World space -> view space.
    vec4 viewPosition = cc_matView * worldPosition;
    vec3 viewNormal = normalize((cc_matViewInvTrans * vec4(worldNormal, 0)).xyz);
    
    // 3. View space -> clip space.
    vec4 clipPosition = cc_matProj * viewPosition;
    
    // 4. Tangent-space handling (only when required).
    #if CC_USE_TANGENT
        vec3 worldTangent = normalize((cc_matWorld * vec4(In.tangent.xyz, 0)).xyz);
        // tangent.w scales (and so can flip) the bitangent.
        vec3 worldBitangent = cross(worldNormal, worldTangent) * In.tangent.w;
        mat3 tbnMatrix = mat3(worldTangent, worldBitangent, worldNormal);
        CC_TRANSFER_TBN(tbnMatrix);
    #endif
}

🎨 片元处理阶段

片元着色器执行流程

CCProgram surface-fs %{
  precision mediump float;
  
  // === Step 1: receive the interpolated data ===
  #include <builtin/inputs/cc-worldpos>
  #include <builtin/inputs/cc-worldnormal>
  #include <builtin/inputs/cc-uv>
  
  // Forward declaration: GLSL requires a function to be declared before it is
  // called, and surf() is defined after frag().
  void surf(out StandardSurface s);
  
  vec4 frag() {
    // Step 2: sample and process the material data.
    StandardSurface surface;
    surf(surface);  // invoke the surface function
    
    // Step 3: lighting.
    vec4 finalColor = calculateLighting(surface);
    
    // Step 4: post-processing effects.
    finalColor = applyPostProcessing(finalColor);
    
    // Step 5: return the final colour.
    return finalColor;
  }
  
  // Details of the surface function.
  void surf(out StandardSurface s) {
    // 2.1: texture sampling.
    vec2 uv = CC_GET_UV();
    vec4 albedoTex = texture(mainTexture, uv);
    vec3 normalTex = texture(normalMap, uv).rgb * 2.0 - 1.0;  // [0,1] -> [-1,1]
    
    // 2.2: material property computation.
    s.albedo = albedoTex.rgb * mainColor.rgb;
    // NOTE(review): the normal-map sample is added straight onto the world
    // normal; a TBN transform is presumably intended outside of this demo.
    s.normal = normalize(CC_GET_WORLDNORMAL() + normalTex);
    s.roughness = roughness;
    s.metallic = metallic;
    s.alpha = albedoTex.a * mainColor.a;
  }
}%

光照计算流程

// Detailed lighting implementation: sums ambient, main-light, optional
// additional-light and emissive terms, and carries the surface alpha through.
vec4 calculateLighting(StandardSurface surface) {
    vec3 worldPos = CC_GET_WORLDPOS();
    vec3 normal = surface.normal;
    vec3 viewDir = normalize(cc_cameraPos.xyz - worldPos);
    
    vec3 finalColor = vec3(0.0);
    
    // 1. Ambient lighting.
    vec3 ambient = calculateAmbientLighting(surface);
    finalColor += ambient;
    
    // 2. Main light.
    vec3 mainLight = calculateMainLighting(surface, normal, viewDir, worldPos);
    finalColor += mainLight;
    
    // 3. Additional lights.
    #if CC_ENABLE_ADDITIONAL_LIGHTS
        for (int i = 0; i < cc_additionalLightCount; i++) {
            vec3 additionalLight = calculateAdditionalLight(i, surface, normal, viewDir, worldPos);
            finalColor += additionalLight;
        }
    #endif
    
    // 4. Emissive.
    finalColor += surface.emissive;
    
    return vec4(finalColor, surface.alpha);
}

// Main-light contribution: Lambert diffuse plus PBR specular, both attenuated
// by the shadow term when shadow receiving is enabled.
vec3 calculateMainLighting(StandardSurface surface, vec3 normal, vec3 viewDir, vec3 worldPos) {
    // cc_mainLitDir is negated to obtain the surface-to-light direction.
    vec3 lightDir = normalize(-cc_mainLitDir.xyz);
    vec3 lightColor = cc_mainLitColor.rgb * cc_mainLitColor.w;
    
    // Lambert diffuse.
    float NdotL = max(dot(normal, lightDir), 0.0);
    vec3 diffuse = surface.albedo * lightColor * NdotL;
    
    // PBR specular.
    vec3 halfDir = normalize(lightDir + viewDir);
    float NdotH = max(dot(normal, halfDir), 0.0);
    float NdotV = max(dot(normal, viewDir), 0.0);
    
    vec3 specular = calculatePBRSpecular(surface.roughness, surface.metallic, 
                                        surface.albedo, NdotL, NdotV, NdotH);
    
    // Shadow attenuation.
    #if CC_RECEIVE_SHADOW
        float shadowAtten = CC_SHADOW_ATTEN();
        diffuse *= shadowAtten;
        specular *= shadowAtten;
    #endif
    
    return diffuse + specular;
}

🔀 前向渲染 vs 延迟渲染

前向渲染执行流程

graph TD
    A[Object 1] --> B[Vertex Shader]
    B --> C[Rasterization]
    C --> D[Fragment Shader + Lighting]
    D --> E[Blend to Frame Buffer]
    
    F[Object 2] --> G[Vertex Shader]
    G --> H[Rasterization]
    H --> I[Fragment Shader + Lighting]
    I --> J[Blend to Frame Buffer]
    
    K[Object N] --> L[Vertex Shader]
    L --> M[Rasterization]
    M --> N[Fragment Shader + Lighting]
    N --> O[Blend to Frame Buffer]

// Lighting in forward rendering.
CCProgram forward-fs %{
    vec4 frag() {
        StandardSurface surface;
        surf(surface);
        
        // All lighting is evaluated directly in the fragment shader.
        vec3 color = vec3(0.0);
        
        // Accumulate the contribution of every light.
        for (int i = 0; i < lightCount; i++) {
            color += calculateLightContribution(surface, lights[i]);
        }
        
        return vec4(color, surface.alpha);
    }
}%

延迟渲染执行流程

graph TD
    A[All Objects] --> B[Geometry Pass]
    B --> C[G-Buffer]
    C --> D[Lighting Pass]
    D --> E[Final Frame Buffer]
    
    B1[Vertex Shader] --> B
    B2[Fragment Shader<br/>Output to G-Buffer] --> B
    
    D1[Screen Quad] --> D
    D2[Lighting Shader<br/>Read from G-Buffer] --> D

// Geometry pass of deferred rendering: evaluates the surface and packs it
// into the G-Buffer layout instead of lighting it.
CCProgram deferred-geometry-fs %{
    struct GBufferOutput {
        vec4 albedoAO;      // RGB: albedo, A: AO
        vec4 normalRoughness; // RGB: normal, A: roughness
        vec4 motionMetallic;  // RG: motion vector, B: metallic, A: unused
    };
    
    GBufferOutput frag() {
        StandardSurface surface;
        surf(surface);
        
        // The local is called `gbuffer` because `output` is a reserved word in GLSL.
        GBufferOutput gbuffer;
        gbuffer.albedoAO = vec4(surface.albedo, surface.ao);
        // Remap the normal from [-1, 1] to [0, 1] for storage.
        gbuffer.normalRoughness = vec4(surface.normal * 0.5 + 0.5, surface.roughness);
        gbuffer.motionMetallic = vec4(calculateMotionVector(), surface.metallic, 0.0);
        
        return gbuffer;
    }
}%

// Lighting pass of deferred rendering: rebuilds the surface from the G-Buffer
// and lights it once per screen pixel.
CCProgram deferred-lighting-fs %{
    vec4 frag() {
        // Read the data back from the G-Buffer.
        vec4 albedoAO = texture(gBufferAlbedoAO, screenUV);
        vec4 normalRoughness = texture(gBufferNormalRoughness, screenUV);
        vec4 motionMetallic = texture(gBufferMotionMetallic, screenUV);
        
        // Rebuild the surface data.
        StandardSurface surface;
        surface.albedo = albedoAO.rgb;
        surface.ao = albedoAO.a;
        // Undo the [0, 1] storage remap, then re-normalize: the stored value
        // is quantized by the render-target format, so the decoded vector is
        // not guaranteed to be unit length.
        surface.normal = normalize(normalRoughness.rgb * 2.0 - 1.0);
        surface.roughness = normalRoughness.a;
        surface.metallic = motionMetallic.b;
        
        // Rebuild the world position.
        vec3 worldPos = reconstructWorldPosition(screenUV, depthTexture);
        
        // Run the lighting computation.
        vec3 finalColor = calculateAllLighting(surface, worldPos);
        
        return vec4(finalColor, 1.0);
    }
}%

🔄 多Pass渲染执行顺序

Pass执行顺序控制

CCEffect %{
  techniques:
  - name: multi-pass-demo
    passes:
    # Pass 0: depth pre-pass (Queue: 1000)
    - vert: depth-vs:vert
      frag: depth-fs:frag
      phase: depth-prepass
      rasterizerState:
        cullMode: back
      depthStencilState:
        depthTest: true
        depthWrite: true
      blendState:
        targets:
        - blend: false
          colorMask: 0  # write depth only, no colour
    
    # Pass 1: main render pass (Queue: 2000)
    # Depth writes are off and the test is `equal`, so only fragments matching
    # the depth laid down by pass 0 are shaded.
    - vert: main-vs:vert
      frag: main-fs:frag
      phase: forward
      rasterizerState:
        cullMode: back
      depthStencilState:
        depthTest: true
        depthWrite: false
        depthFunc: equal
      blendState:
        targets:
        - blend: false
    
    # Pass 2: post-effect pass (Queue: 3000)
    # one/one blending adds this pass on top of the existing colour.
    - vert: effect-vs:vert
      frag: effect-fs:frag
      phase: forward-add
      rasterizerState:
        cullMode: none
      depthStencilState:
        depthTest: true
        depthWrite: false
      blendState:
        targets:
        - blend: true
          blendSrc: one
          blendDst: one
}%

Pass间数据传递

// Passes exchange data through textures.
CCProgram pass1-fs %{
    // The first pass outputs its data to a texture.
    void main() {
        vec4 data = calculateSomeData();
        gl_FragColor = data;
    }
}%

CCProgram pass2-fs %{
    uniform sampler2D pass1Result;
    
    // The second pass reads the result of the first pass.
    void main() {
        vec4 previousData = texture(pass1Result, uv);
        vec4 finalResult = processData(previousData);
        gl_FragColor = finalResult;
    }
}%

⚡ 性能优化关键点

顶点着色器优化

// Before optimisation: heavy per-vertex computation (deliberately slow).
vec4 vert_slow() {
    // Expensive math evaluated for every vertex.
    float complexValue = pow(sin(a_position.x), 3.0) * exp(a_position.y);
    
    // Several separate matrix multiplies.
    vec4 pos1 = cc_matWorld * a_position;
    vec4 pos2 = cc_matView * pos1;
    vec4 pos3 = cc_matProj * pos2;
    
    return pos3;
}

// After optimisation: a leaner vertex computation.
vec4 vert_fast() {
    // Fetch the expensive value from a precomputed lookup table instead of
    // evaluating it per vertex.
    float complexValue = texture(lookupTable, a_position.xy).r;

    // View and projection are folded into one matrix, saving a multiply.
    vec4 worldPos = cc_matWorld * a_position;
    return cc_matViewProj * worldPos;
}

片元着色器优化

// Before optimisation: every fragment samples the texture several times.
vec4 frag_slow() {
    // Four texture fetches per fragment.
    vec4 color1 = texture(tex1, uv);
    vec4 color2 = texture(tex1, uv + offset1);
    vec4 color3 = texture(tex1, uv + offset2);
    vec4 color4 = texture(tex1, uv + offset3);
    
    // Average of the four samples.
    return (color1 + color2 + color3 + color4) * 0.25;
}

// After optimisation: fewer texture fetches, cheaper arithmetic instead.
vec4 frag_fast() {
    // One fetch plus math replaces the multiple fetches.
    vec4 baseColor = texture(tex1, uv);
    vec4 result = baseColor * somePrecomputedValue;
    
    return result;
}

光照计算优化

// Performance strategies for lighting: early-out, distance culling and LOD.
vec3 optimizedLighting(StandardSurface surface) {
    // 1. Early exit: drop near-transparent fragments before any lighting work.
    if (surface.alpha < 0.01) discard;
    
    // 2. Distance culling: a light beyond its range contributes nothing.
    float lightDistance = length(lightPos - worldPos);
    if (lightDistance > lightRange) return vec3(0.0);
    
    // 3. LOD system: level 0 takes the simple path, every other level full PBR.
    int lightingLOD = determineLightingLOD(cameraDistance);
    if (lightingLOD == 0) {
        return calculateSimpleLighting(surface);
    } else {
        return calculateFullPBRLighting(surface);
    }
}

📊 性能分析和调试

渲染统计收集

/**
 * Collects rendering statistics (illustrative).
 * NOTE(review): assumes `gl` is a WebGL2RenderingContext in scope and that
 * `executeRenderPass` is provided elsewhere — confirm against the real class.
 */
class RenderStats {
    vertexShaderTime: number = 0;
    fragmentShaderTime: number = 0;
    drawCalls: number = 0;
    trianglesRendered: number = 0;
    /** GPU time of the most recently resolved timed render pass, in milliseconds. */
    lastPassGpuTimeMs: number = 0;

    /**
     * Times one render pass on the GPU with a timer query.
     * The pass is always executed; the measurement is stored in
     * `lastPassGpuTimeMs` once the GPU has made it available.
     */
    measureShaderPerformance() {
        // TIME_ELAPSED is not a core WebGL2 enum; it comes from this extension.
        const ext = gl.getExtension('EXT_disjoint_timer_query_webgl2');
        if (!ext) {
            // Timer queries unavailable: render anyway, just without timing.
            this.executeRenderPass();
            return;
        }

        // GPU timer query around the pass.
        const query = gl.createQuery();
        gl.beginQuery(ext.TIME_ELAPSED_EXT, query);

        // Run the render pass being measured.
        this.executeRenderPass();

        gl.endQuery(ext.TIME_ELAPSED_EXT);

        // Query results are asynchronous: reading QUERY_RESULT before
        // QUERY_RESULT_AVAILABLE is true is an error, so poll on later frames.
        const poll = () => {
            if (!gl.getQueryParameter(query, gl.QUERY_RESULT_AVAILABLE)) {
                requestAnimationFrame(poll);
                return;
            }
            // A disjoint event makes the timing unreliable; keep the old value then.
            if (!gl.getParameter(ext.GPU_DISJOINT_EXT)) {
                const elapsedNs = gl.getQueryParameter(query, gl.QUERY_RESULT);
                this.lastPassGpuTimeMs = elapsedNs / 1e6;  // nanoseconds -> milliseconds
            }
            gl.deleteQuery(query);
        };
        requestAnimationFrame(poll);
    }
}

瓶颈识别

// Identify the performance bottleneck through conditional compilation:
// change PERFORMANCE_TEST to switch which variant of frag() is compiled.
#define PERFORMANCE_TEST 1

#if PERFORMANCE_TEST == 1
    // Minimal version - tests whether complex computation is the bottleneck.
    vec4 frag() {
        return vec4(1.0, 0.0, 0.0, 1.0);  // solid red
    }
#elif PERFORMANCE_TEST == 2
    // Texture sampling only - tests whether texture bandwidth is the bottleneck.
    vec4 frag() {
        return texture(mainTexture, uv);
    }
#elif PERFORMANCE_TEST == 3
    // Lighting only - tests whether the lighting computation is the bottleneck.
    vec4 frag() {
        return vec4(calculateLighting(), 1.0);
    }
#else
    // Full version.
    vec4 frag() {
        return fullRenderingPipeline();
    }
#endif

💡 最佳实践

执行流程优化

1. 合理安排Pass顺序: 深度预Pass → 不透明物体 → 天空盒 → 透明物体
2. 批处理优化: 相同材质的物体合并渲染
3. 状态变更最小化: 减少渲染状态切换次数
4. early-Z优化: 利用深度测试减少overdraw

内存访问优化

// Optimising the memory access pattern.
// Wrong: scattered memory access.
vec4 badMemoryAccess() {
    vec4 c1 = texture(tex, uv);
    vec4 c2 = texture(otherTex, uv);  // cache miss
    vec4 c3 = texture(tex, uv + offset);  // back to the first texture again
    return c1 + c2 + c3;
}

// Right: contiguous memory access.
// Same three samples as badMemoryAccess(), but both fetches from `tex` are
// issued back to back before switching to `otherTex`.
vec4 goodMemoryAccess() {
    vec4 c1 = texture(tex, uv);
    vec4 c3 = texture(tex, uv + offset);  // consecutive access to the same texture
    vec4 c2 = texture(otherTex, uv);
    return c1 + c2 + c3;
}

理解Surface Shader的执行流程是优化渲染性能的基础。通过掌握这些知识，你可以更好地设计着色器，避免性能瓶颈，创建高效的渲染效果。

下一步学习
第5.4章：Surface Shader Include机制详解 - Include机制详解
第6.1章：光照模型详解 - 光照模型详解