第11.3章：着色器优化技术详解

掌握着色器优化技术是提升游戏性能的关键。本教程将深入介绍各种着色器优化策略，帮助你编写高效的GPU代码。

🎯 学习目标

掌握GPU架构和着色器执行原理
学会分析和优化着色器性能瓶颈
了解各种着色器优化技术
掌握移动端特有的优化策略

📋 前置知识

熟悉着色器编程基础
理解GPU渲染管线
了解基本的计算机图形学概念

🔧 GPU架构基础

GPU执行模型

// GPU并行执行示例
CCProgram gpu_execution_model %{
    // Illustrative fragment shader: a uniform code path and a divergent
    // branch are placed back to back in main() so they can be compared.
    // GPUs execute threads in groups (warp / wavefront);
    // typically 32 threads execute the same instruction at the same time.
    
    void main() {
        // Good practice: every thread follows the same code path
        vec4 color = texture(mainTexture, v_uv);
        color.rgb *= 2.0;
        
        // Bad practice: the branch makes threads in a group diverge
        if (v_uv.x > 0.5) {
            color.rgb *= 2.0;  // half of the threads execute this side
        } else {
            color.rgb *= 0.5;  // the other half execute this side
        }
        
        fragColor = color;
    }
}%

内存层次结构

// GPU内存访问性能对比
/**
 * One tier of the memory hierarchy, described by four string-literal
 * properties (latency, bandwidth, capacity and typical shader usage).
 */
type MemoryTier<
    Latency extends string,
    Bandwidth extends string,
    Size extends string,
    Usage extends string
> = {
    latency: Latency;
    bandwidth: Bandwidth;
    size: Size;
    usage: Usage;
};

/**
 * Rough, order-of-magnitude comparison of GPU memory tiers, listed from
 * fastest/smallest to slowest/largest. Values are literal types used purely
 * as documentation; the usage strings are kept in the tutorial's language.
 */
interface GPUMemoryHierarchy {
    /** Per-thread registers. */
    registers: MemoryTier<'0 cycles', 'Very High', 'Very Small', '局部变量'>;

    /** Cached constant storage. */
    constantMemory: MemoryTier<'1-2 cycles (cached)', 'High', 'Medium', 'Uniform变量'>;

    /** Texture fetch path. */
    textureMemory: MemoryTier<'100-200 cycles', 'Medium', 'Large', '纹理采样'>;

    /** General device memory. */
    globalMemory: MemoryTier<'200-400 cycles', 'Low', 'Very Large', '顶点缓冲、帧缓冲'>;
}

计算优化技术

1. 减少复杂数学运算

// 数学运算优化对比
CCProgram math_optimization %{
    // Lookup texture assumed to hold one full period of sine along U:
    // the texel at u in [0, 1] stores sin(2 * PI * u) in its red channel.
    // NOTE(review): needs a signed or float format (or a remap) to hold negative values - confirm.
    // Declared before its first use, and named differently from the sinLUT()
    // function because a sampler and a function cannot share one identifier.
    uniform sampler2D sinLUTTex;

    // Approximates sin(x) through the lookup texture.
    //   x       angle in radians; any value, wrapped into a single period
    //   returns the red channel of the LUT at the wrapped position
    float sinLUT(float x) {
        // fract() keeps the coordinate in [0, 1) regardless of the sampler wrap mode
        float normalized = fract(x / (2.0 * 3.14159));
        return texture(sinLUTTex, vec2(normalized, 0.5)).r;
    }

    // Unoptimized reference version.
    // The parameter is named `v` because `input` is a reserved word in GLSL.
    vec3 slowVersion(vec3 v) {
        float result = pow(v.x, 2.0);           // expensive; pow() is undefined for a negative base
        result += sqrt(v.y);                    // square root
        result *= sin(v.z * 3.14159);           // trigonometric function
        return vec3(result);
    }

    // Optimized version; computes the same expression as slowVersion().
    vec3 fastVersion(vec3 v) {
        float result = v.x * v.x;               // multiply instead of pow(x, 2.0)
        // Keep sqrt(): pow(y, 0.5) is the more expensive general form, not a speed-up.
        result += sqrt(v.y);
        // Same argument as the reference (z * PI), evaluated through the lookup table.
        // NOTE(review): a texture fetch is not always cheaper than native sin(); profile on the target GPU.
        result *= sinLUT(v.z * 3.14159);
        return vec3(result);
    }
}%

2. 向量化操作

// Vectorization
CCProgram vectorization %{
    // BAD: scalar operations (slow).
    // The same texel is fetched four times and every channel is multiplied separately.
    void scalarVersion() {
        float r = texture(tex, uv).r * color.r;
        float g = texture(tex, uv).g * color.g;
        float b = texture(tex, uv).b * color.b;
        float a = texture(tex, uv).a * color.a;
        fragColor = vec4(r, g, b, a);
    }

    // GOOD: vector operations (fast).
    // One fetch, one component-wise vec4 multiply.
    void vectorVersion() {
        vec4 texColor = texture(tex, uv);
        fragColor = texColor * color;           // single vector operation
    }

    // GOOD: SIMD-friendly operations.
    void simdFriendly() {
        vec4 a = texture(texA, uv);
        vec4 b = texture(texB, uv);
        vec4 c = texture(texC, uv);

        // Whole vectors are combined at once
        vec4 result = a * b + c;               // multiply-add form (fused multiply-add)
        fragColor = result;
    }
}%

🖼️ 纹理优化技术

1. 纹理采样优化

// 纹理采样优化
CCProgram texture_optimization %{
    // BAD: many samples of the same texture
    void redundantSampling() {
        vec4 color1 = texture(mainTex, uv);
        vec4 color2 = texture(mainTex, uv + offset1);  // additional sample
        vec4 color3 = texture(mainTex, uv + offset2);

        fragColor = (color1 + color2 + color3) / 3.0;
    }

    // GOOD: fewer samples
    void optimizedSampling() {
        // Two samples blended with mix() instead of three averaged samples
        vec4 color = texture(mainTex, uv);
        vec4 neighbor = texture(mainTex, uv + offset);

        fragColor = mix(color, neighbor, blendFactor);
    }

    // GOOD: merged texture fetches
    void packedTextures() {
        // Several single-channel maps are packed into one RGBA texture, so one
        // fetch returns all four values. The local is not called `packed`
        // because that word is reserved in older GLSL versions.
        vec4 packedSample = texture(packedTex, uv);
        float roughness = packedSample.r;
        float metallic = packedSample.g;
        float ao = packedSample.b;
        float height = packedSample.a;
    }
}%

2. 纹理压缩和格式选择

// 纹理格式优化
/**
 * Helper that picks a texture format and a maximum texture size for a given
 * usage category.
 */
class TextureOptimizer {
    /**
     * Chooses a texture format name for the given usage.
     *
     * @param usage Usage category of the texture.
     * @returns Format name; 'RGBA8' for any usage not listed below.
     */
    public selectOptimalFormat(usage: TextureUsage): TextureFormat {
        // `sys.platform` holds one concrete platform (Android, iOS, ...), and the
        // Platform enum has no `MOBILE` member, so comparing against it never
        // matches. `sys.isMobile` is the flag that covers all mobile targets.
        const onMobile = sys.isMobile;

        switch (usage) {
            case 'albedo':
                return onMobile ? 'ETC2_RGB' : 'BC1_RGB';

            case 'normal':
                // Two channels only: store XY and reconstruct Z in the shader
                return onMobile ? 'ETC2_RG11' : 'BC5_RG';

            case 'roughnessMetallicAO':
                return 'RGB8';  // three scalar maps packed into one texture

            case 'heightmap':
                return 'R8';    // a single channel is enough

            default:
                return 'RGBA8';
        }
    }

    /**
     * Caps a texture size according to its usage category.
     *
     * @param originalSize Source texture size in pixels.
     * @param usage Usage category; categories without an entry are capped at 512.
     * @returns The smaller of `originalSize` and the cap for `usage`.
     */
    public optimizeTextureSize(originalSize: number, usage: TextureUsage): number {
        // Typed as a string-keyed record so indexing with `usage` is type-safe.
        const maxSizes: Record<string, number> = {
            'ui': 2048,
            'character': 1024,
            'environment': 512,
            'effects': 256
        };

        return Math.min(originalSize, maxSizes[usage] || 512);
    }
}

3. Mipmap优化

// Mipmap optimization techniques
CCProgram mipmap_optimization %{
    // Manual mipmap level selection.
    //   uv       texture coordinate
    //   texSize  texture dimensions in texels (not named `textureSize`, which is a built-in function)
    //   returns  mip level, never below 0
    float calculateMipmapLevel(vec2 uv, vec2 texSize) {
        vec2 dx = dFdx(uv * texSize);
        vec2 dy = dFdy(uv * texSize);
        float maxDelta = max(dot(dx, dx), dot(dy, dy));
        // 0.5 * log2(d^2) == log2(d). The max() clamps magnified texels, and the
        // -inf produced by log2(0.0), to the base level.
        return max(0.0, 0.5 * log2(maxDelta));
    }

    // Texture sample with an explicitly computed mip level
    vec4 optimizedTextureSample(sampler2D tex, vec2 uv) {
        // Query the base-level dimensions from the sampler instead of relying
        // on an undeclared variable.
        vec2 texSize = vec2(textureSize(tex, 0));
        float level = calculateMipmapLevel(uv, texSize);
        return textureLod(tex, uv, level);
    }

    // Anisotropic filtering
    vec4 anisotropicSample(sampler2D tex, vec2 uv) {
        // Compute the anisotropy ratio from the screen-space derivatives
        vec2 texSize = vec2(textureSize(tex, 0));
        vec2 dx = dFdx(uv * texSize);
        vec2 dy = dFdy(uv * texSize);

        float lenX = length(dx);
        float lenY = length(dy);
        float maxAniso = max(lenX, lenY);
        float minAniso = min(lenX, lenY);
        // Guard the division: the shorter axis can be zero
        float ratio = maxAniso / max(minAniso, 1e-6);

        // Limit the anisotropy level for performance
        ratio = min(ratio, 4.0);

        // `ratio` is computed for illustration only and is not used by the
        // sample below; the GPU sampler applies anisotropic filtering itself.
        return texture(tex, uv);
    }
}%

🔀 分支优化技术

1. 避免动态分支

// 分支优化对比
CCProgram branch_optimization %{
    // BAD: dynamic branch (threads in one group may take different sides)
    //   returns color * 2.0 when condition > 0.5, otherwise color * 0.5
    vec3 dynamicBranch(vec3 color, float condition) {
        if (condition > 0.5) {
            return color * 2.0;      // branch A
        } else {
            return color * 0.5;      // branch B
        }
    }

    // GOOD: branch removed with step()
    vec3 eliminateBranch(vec3 color, float condition) {
        // step(edge, x) is 0.0 when x < edge and 1.0 otherwise, so
        // 1.0 - step(condition, 0.5) is 1.0 exactly when condition > 0.5.
        // step(0.5, condition) would also select the 2.0 factor at
        // condition == 0.5, which differs from dynamicBranch().
        float selectA = 1.0 - step(condition, 0.5);
        float factor = mix(0.5, 2.0, selectA);
        return color * factor;
    }

    // GOOD: branch removed by computing both results and blending (mix / lerp)
    vec3 lerpBranch(vec3 color, float condition) {
        vec3 resultA = color * 2.0;
        vec3 resultB = color * 0.5;
        // Same strict "> 0.5" selector as above so all three functions agree
        return mix(resultB, resultA, 1.0 - step(condition, 0.5));
    }
}%

2. 静态分支优化

// Static branches and macro definitions
CCProgram static_branches %{
    // Static branch selected by a macro: only one calculateNormal() is compiled
    #if defined(ENABLE_NORMAL_MAPPING)
        // NOTE(review): the sampled normal is returned without a tangent-to-world
        // transform, while the fallback returns v_worldNormal - confirm the intended space.
        vec3 calculateNormal() {
            vec3 normal = texture(normalTexture, v_uv).xyz * 2.0 - 1.0;
            return normalize(normal);
        }
    #else
        vec3 calculateNormal() {
            return normalize(v_worldNormal);
        }
    #endif

    // Static branch by feature level: one pbrLighting() per device tier
    #if FEATURE_LEVEL >= 3
        // High-end devices: full PBR
        vec3 pbrLighting() {
            return calculateFullPBR();
        }
    #elif FEATURE_LEVEL >= 2
        // Mid-range devices: simplified PBR
        vec3 pbrLighting() {
            return calculateSimplifiedPBR();
        }
    #else
        // Low-end devices: Blinn-Phong
        vec3 pbrLighting() {
            return calculateBlinnPhong();
        }
    #endif
}%

3. 着色器变体管理（用预编译变体替代运行时分支）

// 动态着色器变体管理
/**
 * Caches compiled shader variants, keyed by the set of features a render
 * context enables, so each feature combination is compiled at most once.
 */
class ShaderVariantManager {
    private variants: Map<string, Shader> = new Map();

    /**
     * Returns the shader variant matching `context`, compiling and caching it
     * on first request.
     */
    public getOptimalShader(context: RenderContext): Shader {
        const variantKey = this.generateVariantKey(context);

        // Fast path: this feature combination has been compiled before.
        if (this.variants.has(variantKey)) {
            return this.variants.get(variantKey)!;
        }

        const compiled = this.compileVariant(context);
        this.variants.set(variantKey, compiled);
        return compiled;
    }

    /**
     * Builds the cache key: the names of all enabled features joined with '|',
     * always in the same order. A context with no feature enabled yields ''.
     */
    private generateVariantKey(context: RenderContext): string {
        const toggles: Array<[unknown, string]> = [
            [context.hasNormalMap, 'NORMAL_MAP'],
            [context.lightCount > 4, 'MANY_LIGHTS'],
            [context.enableShadows, 'SHADOWS'],
            [context.enableSSAO, 'SSAO'],
        ];

        return toggles
            .filter(([enabled]) => Boolean(enabled))
            .map(([, featureName]) => featureName)
            .join('|');
    }

    /**
     * Compiles the base shader with the defines generated for `context`.
     * NOTE(review): `generateDefines`, `shaderCompiler` and `baseShader` are not
     * declared in this snippet - presumably provided elsewhere; confirm.
     */
    private compileVariant(context: RenderContext): Shader {
        return this.shaderCompiler.compile(this.baseShader, this.generateDefines(context));
    }
}

📊 内存带宽优化

1. 减少内存访问

// 内存访问优化
CCProgram memory_optimization %{
    // BAD: repeated access and repeated work
    void redundantAccess() {
        vec3 normal = normalize(v_worldNormal);
        vec3 lightDir = normalize(lightPosition - v_worldPos);
        vec3 viewDir = normalize(cameraPosition - v_worldPos);

        // v_worldPos is read again, and each distance() recomputes the vector
        // length that normalize() already needed above
        float dist1 = distance(lightPosition, v_worldPos);
        float dist2 = distance(cameraPosition, v_worldPos);
    }

    // GOOD: keep frequently used values in locals
    void cachedAccess() {
        vec3 worldPos = v_worldPos;  // copied into a local once
        vec3 normal = normalize(v_worldNormal);

        vec3 lightDir = lightPosition - worldPos;
        vec3 viewDir = cameraPosition - worldPos;

        float lightDist = length(lightDir);
        float viewDist = length(viewDir);

        // Reuse the lengths to normalize instead of calling normalize()
        lightDir /= lightDist;
        viewDir /= viewDist;
    }
}%

2. 数据打包技术

// 数据打包优化
CCProgram data_packing %{
    // BAD: unpacked data, four separate scalars
    struct UnpackedData {
        float roughness;    // 4 bytes
        float metallic;     // 4 bytes
        float ao;           // 4 bytes
        float height;       // 4 bytes
        // total: 16 bytes
    };

    // GOOD: packed data, one vector holding all four values
    struct PackedData {
        // A vec4 of 32-bit floats is 16 bytes in the shader, not 4. The saving
        // comes from the source: one RGBA8 texel (4 bytes) yields all four values.
        // NOTE(review): `packed` is a reserved word in older GLSL versions;
        // the field name is kept so existing references still match - confirm the target version.
        vec4 packed;
        // R: roughness, G: metallic, B: ao, A: height
    };

    // Packs a unit normal into two components (stereographic projection),
    // saving one component.
    //   normal  unit-length vector; undefined for normal.z == -1.0
    vec2 packNormal(vec3 normal) {
        return normal.xy / (normal.z + 1.0);
    }

    // Exact inverse of packNormal(): n = (2p, 1 - |p|^2) / (1 + |p|^2).
    // The sqrt-based Lambert equal-area decode used here before does not invert
    // the stereographic encode above, e.g. (1, 0, 0) came back as (0.866, 0, 0.5).
    vec3 unpackNormal(vec2 enc) {
        float lenSq = dot(enc, enc);
        return vec3(2.0 * enc, 1.0 - lenSq) / (1.0 + lenSq);
    }

    // Packs an RGB color into the low 24 bits of a uint (RGB888).
    uint packColor(vec3 color) {
        // Clamp so an out-of-range channel cannot spill into its neighbour's
        // bits, and add 0.5 so the conversion rounds instead of truncating.
        uvec3 c = uvec3(clamp(color, 0.0, 1.0) * 255.0 + 0.5);
        return (c.r << 16) | (c.g << 8) | c.b;  // RGB888
    }
}%

🎯 LOD和可见性优化

1. 着色器LOD系统

// 着色器LOD实现
CCProgram shader_lod %{
    // Cocos effects require non-sampler uniforms to be declared inside a
    // uniform block; loose `uniform float` declarations are rejected.
    // The member names are unchanged.
    uniform ShaderLODParams {
        float distanceToCamera;
        float lodBias;
    };

    // Computes the shader LOD level in [0.0, 3.0] from the camera distance.
    float calculateShaderLOD() {
        // log2() is undefined for values <= 0.0, so keep the argument positive.
        // The local is not named `distance`, which would shadow the built-in.
        float dist = max(distanceToCamera, 1e-4);
        float lod = log2(dist) + lodBias;
        return clamp(lod, 0.0, 3.0);
    }

    // Picks a lighting model from the LOD level.
    // Both inputs are uniforms, so every fragment of a draw call takes the
    // same side of each branch.
    vec3 calculateLighting() {
        float lod = calculateShaderLOD();

        if (lod < 1.0) {
            // LOD 0: full PBR lighting
            return calculateFullPBR();
        } else if (lod < 2.0) {
            // LOD 1: simplified PBR
            return calculateSimplifiedPBR();
        } else if (lod < 3.0) {
            // LOD 2: Blinn-Phong
            return calculateBlinnPhong();
        } else {
            // LOD 3: ambient light only
            return calculateAmbientOnly();
        }
    }
}%

2. 动态质量调整

// 自适应质量系统
/**
 * Component that raises or lowers a global quality level (0-3) so the
 * average frame time stays close to `targetFrameTime`.
 *
 * NOTE(review): `updateFrameTimeHistory`, `getAverageFrameTime` and
 * `getQualitySettings` are called but not declared in this snippet -
 * presumably defined elsewhere; confirm.
 */
@ccclass('AdaptiveQualityManager')
export class AdaptiveQualityManager extends Component {
    @property
    targetFrameTime: number = 16.67; // 60 FPS

    @property
    qualityLevel: number = 2; // 0-3

    private frameTimeHistory: number[] = [];

    /** Records the latest frame time, then re-evaluates the quality level. */
    public update() {
        this.updateFrameTimeHistory();
        this.adjustQuality();
    }

    /**
     * Moves the quality level one step down when the average frame time is more
     * than 20% over target, one step up when it is more than 20% under target,
     * and leaves it alone in between.
     */
    private adjustQuality() {
        const avgFrameTime = this.getAverageFrameTime();
        const previousLevel = this.qualityLevel;

        if (avgFrameTime > this.targetFrameTime * 1.2) {
            // Not enough performance: lower the quality
            this.qualityLevel = Math.max(0, this.qualityLevel - 1);
        } else if (avgFrameTime < this.targetFrameTime * 0.8) {
            // Performance to spare: raise the quality
            this.qualityLevel = Math.min(3, this.qualityLevel + 1);
        }

        // Re-apply settings only when the level actually changed, so a level
        // already pinned at 0 or 3 does not push the same state every frame.
        if (this.qualityLevel !== previousLevel) {
            this.applyQualitySettings();
        }
    }

    /** Pushes the settings of the current quality level to the renderer. */
    private applyQualitySettings() {
        const settings = this.getQualitySettings(this.qualityLevel);

        // Apply the shader LOD
        rendering.setGlobalMacro('SHADER_LOD', this.qualityLevel);

        // Adjust the number of lights
        rendering.setGlobalInt('MAX_LIGHTS', settings.maxLights);

        // Adjust the shadow quality
        rendering.setGlobalFloat('SHADOW_DISTANCE', settings.shadowDistance);
    }
}