Skynet 最佳实践与生产部署

全面总结 Skynet 开发的最佳实践,涵盖代码规范、错误处理、安全防护、性能优化、部署运维和故障排查等关键主题。

本教程总结了 Skynet 游戏服务器开发的最佳实践,帮助开发者构建高质量、可维护、安全稳定的生产级应用。

代码组织规范

项目目录结构

my-game-server/
├── config/                 # 配置文件
│   ├── dev.conf           # 开发环境配置
│   ├── prod.conf          # 生产环境配置
│   └── cluster.conf       # 集群配置
├── service/               # 服务脚本
│   ├── main.lua          # 主服务(启动入口)
│   ├── gate/             # 网关相关服务
│   │   ├── wsgate.lua
│   │   └── agent.lua
│   ├── logic/            # 业务逻辑服务
│   │   ├── auth.lua
│   │   ├── player.lua
│   │   └── battle.lua
│   └── data/             # 数据服务
│       ├── mysql.lua
│       └── redis.lua
├── lualib/               # Lua 库
│   ├── protocol.lua      # 协议定义
│   ├── utils.lua         # 工具函数
│   └── logger.lua        # 日志工具
├── proto/                # 协议文件
│   └── message.proto
├── sql/                  # 数据库脚本
│   ├── schema.sql
│   └── migrations/
├── scripts/              # 运维脚本
│   ├── start.sh
│   ├── stop.sh
│   └── backup.sh
├── logs/                 # 日志目录
├── tests/                # 测试代码
└── docs/                 # 文档

服务命名规范

-- 好的命名
service/user_service.lua      -- 用户服务
service/battle_manager.lua    -- 战斗管理器
service/chat_room.lua         -- 聊天室

-- 避免的命名
service/srv1.lua             -- 无意义
service/handler.lua          -- 太泛化
service/test.lua             -- 测试代码不应放在 service 目录

模块组织

-- lualib/player/
--   ├── init.lua          -- 模块入口
--   ├── attributes.lua    -- 属性系统
--   ├── inventory.lua     -- 背包系统
--   └── skills.lua        -- 技能系统

-- lualib/player/init.lua
local M = {}

M.attributes = require("player.attributes")
M.inventory = require("player.inventory")
M.skills = require("player.skills")

--- Build a fresh player record for the given user.
-- Sub-systems are created in the order attributes, inventory, skills.
-- @param user_id identifier stored on the record as `user_id`
-- @return table with fields `user_id`, `attrs`, `inv` and `skills`
function M.new(user_id)
    local player = { user_id = user_id }
    player.attrs = M.attributes.new()
    player.inv = M.inventory.new()
    player.skills = M.skills.new()
    return player
end

return M

错误处理

统一错误码系统

-- lualib/error_codes.lua
-- Numeric error codes, grouped by range so the origin of a failure can be
-- read off the code itself.
local ErrorCodes = {
    -- Generic errors (1-999)
    SUCCESS = 0,
    UNKNOWN_ERROR = 1,
    INVALID_PARAMS = 2,
    PERMISSION_DENIED = 3,
    RESOURCE_NOT_FOUND = 4,
    RESOURCE_ALREADY_EXISTS = 5,
    -- Returned by rate-limited entry points (e.g. the login example).
    TOO_MANY_REQUESTS = 6,

    -- Authentication (1000-1999)
    AUTH_FAILED = 1000,
    TOKEN_EXPIRED = 1001,
    TOKEN_INVALID = 1002,
    USER_NOT_FOUND = 1003,
    PASSWORD_WRONG = 1004,

    -- Game logic (2000-2999)
    NOT_IN_ROOM = 2000,
    ROOM_FULL = 2001,
    ROOM_NOT_FOUND = 2002,
    BATTLE_NOT_STARTED = 2003,

    -- Data layer (3000-3999)
    DB_ERROR = 3000,
    CACHE_ERROR = 3001,
    INSUFFICIENT_GOLD = 3002,
    ITEM_NOT_FOUND = 3003,
}

-- Human-readable text per code. Codes without an entry fall back to the
-- generic "unknown error" text in M.error.
local ErrorMessages = {
    [ErrorCodes.SUCCESS] = "成功",
    [ErrorCodes.UNKNOWN_ERROR] = "未知错误",
    [ErrorCodes.INVALID_PARAMS] = "参数无效",
    [ErrorCodes.TOO_MANY_REQUESTS] = "请求过于频繁",
    [ErrorCodes.AUTH_FAILED] = "认证失败",
    -- ...
}

local M = {
    codes = ErrorCodes,
    messages = ErrorMessages,
}

--- Build an error reply.
-- @param code one of M.codes
-- @param extra_msg optional detail appended after the base message; any
--   value is accepted (it is converted with tostring), so error objects
--   caught by pcall can be passed straight through
-- @return table `{code = code, message = text}`
function M.error(code, extra_msg)
    local msg = ErrorMessages[code] or "未知错误"
    if extra_msg then
        -- tostring: a pcall error value may be a table, and `..` would raise on it.
        msg = msg .. ": " .. tostring(extra_msg)
    end
    return {code = code, message = msg}
end

--- Build a success reply.
-- @param data payload stored under `data`
-- @return table `{code = SUCCESS, data = data}`
function M.ok(data)
    return {code = ErrorCodes.SUCCESS, data = data}
end

return M

防御性编程

-- 服务模板:包含完整的错误处理
local skynet = require "skynet"
local ErrorCode = require "error_codes"

local CMD = {}

--- Template command: validate the input, run the business logic in
-- protected mode, and wrap the outcome in a uniform reply table.
-- @param param1 must be a number
-- @param param2 forwarded unchanged to do_something
-- @return reply built by ErrorCode.ok or ErrorCode.error
function CMD.some_command(param1, param2)
    -- Step 1: anything that is not a number (nil included) is rejected.
    if type(param1) ~= "number" then
        return ErrorCode.error(ErrorCode.codes.INVALID_PARAMS, "param1")
    end

    -- Step 2: run the real work under pcall so a raised error becomes a reply.
    local function run()
        return do_something(param1, param2)
    end
    local succeeded, outcome = pcall(run)

    if succeeded then
        -- Step 3: wrap the result.
        return ErrorCode.ok(outcome)
    end

    skynet.error("业务逻辑错误:", outcome)
    return ErrorCode.error(ErrorCode.codes.UNKNOWN_ERROR, outcome)
end

-- 消息分发器(带错误处理)
-- Message dispatcher with error handling.
-- Every "lua" message is routed to CMD[cmd]. Handler failures are always
-- logged; a reply is only sent when the caller expects one (session ~= 0).
skynet.start(function()
    skynet.dispatch("lua", function(session, source, cmd, ...)
        local handler = CMD[cmd]
        if not handler then
            skynet.error("未知命令:", cmd)
            if session ~= 0 then
                skynet.ret(skynet.pack(
                    ErrorCode.error(ErrorCode.codes.UNKNOWN_ERROR, "unknown cmd")
                ))
            end
            return
        end

        -- Run the command, trapping anything it raises.
        local ok, result = pcall(handler, ...)
        if not ok then
            -- Log even for fire-and-forget messages (session == 0); otherwise
            -- their failures would vanish without a trace.
            local reason = tostring(result)
            skynet.error(string.format("命令 %s 执行失败: %s", cmd, reason))
            result = ErrorCode.error(ErrorCode.codes.UNKNOWN_ERROR, reason)
        end

        if session ~= 0 then
            skynet.ret(skynet.pack(result))
        end
    end)
end)

超时处理

-- lualib/timeout.lua
local skynet = require "skynet"

local M = {}

-- 带超时的调用
--- Call a service with an upper bound on how long the caller waits.
--
-- The request runs in a forked coroutine while the calling coroutine sleeps
-- for at most `timeout_ms`; whichever finishes first decides the outcome.
--
-- @param timeout_ms maximum wait in milliseconds (Skynet timers tick in
--   1/100 s, so the value is rounded up to the next 10 ms)
-- @param service target service address or name
-- @param cmd command name sent as a "lua" message
-- @param ... arguments forwarded to the command
-- @return true followed by every value the service returned, or
--   false, "timeout" when the deadline passed, or
--   false, err when skynet.call raised
--
-- NOTE: Skynet cannot abort an in-flight call. After a timeout the forked
-- coroutine stays parked until the target answers (or exits); its late
-- result is discarded.
function M.call(timeout_ms, service, cmd, ...)
    local waiter = coroutine.running()
    local decided = false  -- true once either side has settled the outcome
    local reply            -- packed pcall results: {ok, ...; n = count}

    -- `...` is not visible inside the closure below, so capture it here.
    local args = table.pack(...)

    skynet.fork(function()
        local packed = table.pack(pcall(skynet.call, service, "lua", cmd,
            table.unpack(args, 1, args.n)))
        if not decided then
            decided = true
            reply = packed
            skynet.wakeup(waiter)
        end
    end)

    -- Sleep until the deadline, or until the fork wakes us early.
    skynet.sleep(math.ceil(timeout_ms / 10))

    if not reply then
        decided = true  -- make the fork drop a late answer
        return false, "timeout"
    end

    -- Either (true, results...) or (false, err).
    return table.unpack(reply, 1, reply.n)
end

return M

-- Usage example: give a database query at most 5000 (milliseconds, per the
-- `timeout_ms` parameter name) before giving up.
local Timeout = require "timeout"

-- NOTE(review): `sql` and `skynet` are not defined in this snippet; they are
-- presumably provided by the surrounding service code.
local ok, result = Timeout.call(5000, ".db_service", "query", sql)
if not ok then
    -- `result` holds either "timeout" or the error from the call.
    skynet.error("数据库查询超时或失败:", result)
    return
end

安全防护

输入验证

-- lualib/validator.lua
local M = {}

-- Render a numeric bound for an error message. `%d` raises on floats such as
-- 0.5 in Lua 5.3+, so use a general format that prints integers unchanged.
local function format_bound(n)
    return string.format("%.14g", n)
end

--- Check that `value` is a string whose byte length is within bounds.
-- @param value candidate
-- @param min_len optional minimum length in bytes
-- @param max_len optional maximum length in bytes
-- @return true, or false plus a message
function M.is_string(value, min_len, max_len)
    if type(value) ~= "string" then
        return false, "必须是字符串"
    end
    local len = #value
    if min_len and len < min_len then
        return false, string.format("长度不能小于 %d", min_len)
    end
    if max_len and len > max_len then
        return false, string.format("长度不能大于 %d", max_len)
    end
    return true
end

--- Check that `value` is a number within optional bounds.
-- Bounds may be fractional.
-- @param value candidate
-- @param min optional inclusive lower bound
-- @param max optional inclusive upper bound
-- @return true, or false plus a message
function M.is_number(value, min, max)
    if type(value) ~= "number" then
        return false, "必须是数字"
    end
    if min and value < min then
        return false, "不能小于 " .. format_bound(min)
    end
    if max and value > max then
        return false, "不能大于 " .. format_bound(max)
    end
    return true
end

--- Check that `value` is a whole number within optional bounds.
-- Floats with no fractional part (e.g. 3.0) are accepted.
-- @return true, or false plus a message
function M.is_integer(value, min, max)
    local ok, err = M.is_number(value, min, max)
    if not ok then return false, err end
    if value ~= math.floor(value) then
        return false, "必须是整数"
    end
    return true
end

--- Check a username: 3-20 bytes of ASCII letters, digits or underscores.
-- @return true, or false plus a message
function M.is_username(value)
    local ok, err = M.is_string(value, 3, 20)
    if not ok then return false, err end

    if not string.match(value, "^[a-zA-Z0-9_]+$") then
        return false, "只能包含字母、数字和下划线"
    end

    return true
end

--- Check a password: 6-32 bytes with at least one letter and one digit.
-- @return true, or false plus a message
function M.is_password(value)
    local ok, err = M.is_string(value, 6, 32)
    if not ok then return false, err end

    if not string.match(value, "[a-zA-Z]") then
        return false, "必须包含字母"
    end
    if not string.match(value, "[0-9]") then
        return false, "必须包含数字"
    end

    return true
end

--- Loose syntactic e-mail check (local@domain.tld).
-- @return true, or false plus a message
function M.is_email(value)
    if type(value) ~= "string" then
        return false, "必须是字符串"
    end

    local pattern = "^[%w%.%-_]+@[%w%.%-]+%.[%a]+$"
    if not string.match(value, pattern) then
        return false, "邮箱格式不正确"
    end

    return true
end

-- Checker per schema `type`. Each one takes (value, min, max).
local TYPE_CHECKS = {
    string = M.is_string,
    number = M.is_number,
    integer = M.is_integer,
}

--- Validate a table against a schema.
-- @param data table of field -> value
-- @param schema table of field -> rules, where rules may contain
--   `required` (nil and "" count as missing), `type` ("string", "number"
--   or "integer"; other types are not checked), `min` and `max`
-- @return true, or false plus a message naming the first failing field
--   found (field order follows `pairs`, so it is unspecified)
function M.validate_table(data, schema)
    for field, rules in pairs(schema) do
        local value = data[field]

        if rules.required and (value == nil or value == "") then
            return false, string.format("字段 %s 不能为空", field)
        end

        local check = TYPE_CHECKS[rules.type]
        if value ~= nil and check then
            local ok, err = check(value, rules.min, rules.max)
            if not ok then
                return false, string.format("字段 %s: %s", field, err)
            end
        end
    end

    return true
end

return M

-- Usage example: validate an incoming registration payload.
local Validator = require "validator"

-- One rule set per field; `min`/`max` are byte lengths for strings and
-- value bounds for numbers.
local schema = {
    username = {type = "string", required = true, min = 3, max = 20},
    password = {type = "string", required = true, min = 6, max = 32},
    email = {type = "string", required = false},
    age = {type = "integer", required = false, min = 0, max = 150},
}

-- NOTE(review): `user_data` and `ErrorCode` are not defined in this snippet;
-- presumably they come from the enclosing handler.
local ok, err = Validator.validate_table(user_data, schema)
if not ok then
    return ErrorCode.error(ErrorCode.codes.INVALID_PARAMS, err)
end

防止 SQL 注入

-- lualib/db_helper.lua
local skynet = require "skynet"

local M = {}

-- Example allow-list of fully qualified column names.
-- NOTE(review): nothing in this module consults it yet; identifier checks
-- below are purely syntactic. Wire it in once the list is complete.
local valid_fields = {
    -- users table
    ["users.id"] = true,
    ["users.username"] = true,
    ["users.password_hash"] = true,
    ["users.email"] = true,
    -- ...
}

-- Syntactic identifier check: a non-empty string of ASCII letters, digits,
-- underscores and dots. Kept local so it no longer leaks into the global
-- environment, and defined before its first use.
local function is_valid_field_name(field)
    return type(field) == "string"
        and string.match(field, "^[a-zA-Z0-9_%.]+$") ~= nil
end

--- Run a parameterised query through the ".mysql" service.
-- @param sql statement using `?` placeholders
-- @param params array of values bound to the placeholders
-- @return whatever the ".mysql" service returns
function M.query(sql, params)
    -- Values travel separately from the SQL text; never concatenate them in.
    return skynet.call(".mysql", "lua", "query", sql, params)
end

--- Insert one row.
-- Identifiers cannot be bound as parameters, so both the table name and the
-- column names are validated before being placed into the statement.
-- @param table_name target table
-- @param data table of column -> value
-- @return result of M.query
-- Raises an error on an invalid table or column name.
function M.insert(table_name, data)
    if not is_valid_field_name(table_name) then
        error("无效的表名: " .. tostring(table_name))
    end

    local fields = {}
    local placeholders = {}
    local values = {}

    -- One pass keeps columns and values in the same order.
    for field, value in pairs(data) do
        if not is_valid_field_name(field) then
            error("无效的字段名: " .. tostring(field))
        end

        fields[#fields + 1] = field
        placeholders[#placeholders + 1] = "?"
        values[#values + 1] = value
    end

    local sql = string.format(
        "INSERT INTO %s (%s) VALUES (%s)",
        table_name,
        table.concat(fields, ", "),
        table.concat(placeholders, ", ")
    )

    return M.query(sql, values)
end

return M

防止 XSS 和消息注入

-- lualib/security.lua
local M = {}

-- Replacement for each character that is special in HTML.
local HTML_ESCAPES = {
    ["&"] = "&amp;",
    ["<"] = "&lt;",
    [">"] = "&gt;",
    ['"'] = "&quot;",
    ["'"] = "&#39;",
}

--- Escape HTML-special characters (XSS protection).
-- @param str value to escape; non-strings are returned unchanged
-- @return the escaped string (exactly one value)
function M.escape_html(str)
    if type(str) ~= "string" then
        return str
    end

    -- Parentheses drop gsub's second result (the match count), which would
    -- otherwise leak into callers such as table.insert(t, escape_html(s)).
    return (string.gsub(str, "[&<>\"']", HTML_ESCAPES))
end

-- Cut `s` to at most `max_bytes` bytes without splitting a UTF-8 character:
-- while the byte right after the cut is a continuation byte (0x80-0xBF),
-- move the cut left.
local function truncate_utf8(s, max_bytes)
    local cut = max_bytes
    while cut > 0 do
        local following = string.byte(s, cut + 1)
        if following < 0x80 or following >= 0xC0 then
            break
        end
        cut = cut - 1
    end
    return string.sub(s, 1, cut)
end

--- Sanitise a chat-style message.
-- Truncates, strips ASCII control characters, then HTML-escapes.
-- @param msg message text; non-strings yield ""
-- @param max_length optional limit in bytes, applied before escaping
-- @return sanitised string
function M.filter_message(msg, max_length)
    if type(msg) ~= "string" then
        return ""
    end

    if max_length and #msg > max_length then
        msg = truncate_utf8(msg, max_length)
    end

    -- Strip control characters.
    msg = string.gsub(msg, "[\0-\31\127]", "")

    return M.escape_html(msg)
end

-- Sensitive words (placeholder entries; load the real list from a file).
local sensitive_words = {
    "脏话1",
    "脏话2",
}

-- Escape every Lua-pattern magic character so a word matches literally.
local function escape_pattern(text)
    return (string.gsub(text, "[%^%$%(%)%%%.%[%]%*%+%-%?]", "%%%0"))
end

--- Mask each sensitive word with one `*` per character.
-- Falls back to one `*` per byte when the utf8 library is unavailable or
-- the word is not valid UTF-8.
-- @param msg message text
-- @return masked message
function M.filter_sensitive(msg)
    for _, word in ipairs(sensitive_words) do
        local chars = (utf8 and utf8.len(word)) or #word
        msg = string.gsub(msg, escape_pattern(word), string.rep("*", chars))
    end
    return msg
end

-- True for dotted-quad hosts in loopback or private ranges:
-- 127/8, 10/8, 172.16/12 and 192.168/16.
local function is_private_ipv4(host)
    local a, b = string.match(host, "^(%d+)%.(%d+)%.%d+%.%d+$")
    if not a then
        return false
    end
    a, b = tonumber(a), tonumber(b)
    return a == 127
        or a == 10
        or (a == 172 and b >= 16 and b <= 31)
        or (a == 192 and b == 168)
end

--- Decide whether a URL may be fetched on behalf of a user (SSRF guard).
-- Accepts only http/https URLs whose host is not localhost, an IPv6
-- loopback literal, or a loopback/private IPv4 literal. Applies to both
-- schemes and ignores any `user@` prefix and `:port` suffix.
--
-- NOTE: purely syntactic. A DNS name that resolves to an internal address,
-- or an alternative IPv4 spelling (decimal, hex, short form), is not
-- detected; validate the resolved address as well when that matters.
-- @param url candidate URL
-- @return boolean
function M.is_safe_url(url)
    if type(url) ~= "string" then
        return false
    end

    -- Only http/https; the authority ends at the first "/", "?" or "#".
    local authority = string.match(url, "^https?://([^/?#]*)")
    if not authority then
        return false
    end

    -- Drop userinfo: the real host follows the last "@".
    local hostport = string.match(authority, "@([^@]*)$") or authority
    -- Bracketed IPv6 literal, otherwise everything before the port.
    local host = string.match(hostport, "^%[(.-)%]")
        or string.match(hostport, "^([^:]*)")
    host = string.lower(host)

    if host == "" or host == "localhost" or host == "::1" then
        return false
    end

    return not is_private_ipv4(host)
end

return M

频率限制

-- lualib/rate_limiter.lua
local skynet = require "skynet"

-- Sliding-window rate limiter: at most `max_requests` accepted calls per key
-- within any `time_window` seconds.
local RateLimiter = {}
RateLimiter.__index = RateLimiter

-- A full sweep of stale keys runs once per this many check() calls.
local SWEEP_EVERY = 1024

--- Create a limiter.
-- @param max_requests accepted requests allowed per window
-- @param time_window window length in seconds
function RateLimiter.new(max_requests, time_window)
    return setmetatable({
        max_requests = max_requests,
        time_window = time_window,
        requests = {},  -- key -> array of accept timestamps, oldest first
        checks_since_sweep = 0,
    }, RateLimiter)
end

-- Drop every key whose newest timestamp has left the window. Without this,
-- keys that never come back (e.g. one-off client IPs) would stay forever.
local function sweep(self, window_start)
    for key, stamps in pairs(self.requests) do
        local newest = stamps[#stamps]
        if newest == nil or newest <= window_start then
            self.requests[key] = nil  -- clearing during pairs() is allowed
        end
    end
end

--- Record a request for `key` if it is within the limit.
-- @param key caller identity (user id, IP, ...)
-- @return true when accepted, or false plus a message when over the limit
--   (rejected requests are not recorded)
function RateLimiter:check(key)
    local now = skynet.time()
    local window_start = now - self.time_window

    self.checks_since_sweep = (self.checks_since_sweep or 0) + 1
    if self.checks_since_sweep >= SWEEP_EVERY then
        self.checks_since_sweep = 0
        sweep(self, window_start)
    end

    -- Keep only this key's timestamps that are still inside the window.
    local timestamps = self.requests[key] or {}
    local valid_timestamps = {}

    for _, ts in ipairs(timestamps) do
        if ts > window_start then
            valid_timestamps[#valid_timestamps + 1] = ts
        end
    end

    if #valid_timestamps >= self.max_requests then
        self.requests[key] = valid_timestamps
        return false, string.format(
            "请求过于频繁,%d 秒内最多 %d 次",
            self.time_window, self.max_requests
        )
    end

    valid_timestamps[#valid_timestamps + 1] = now
    self.requests[key] = valid_timestamps

    return true
end

-- Shared limiter instances, one per action name.
local limiters = {}
limiters.login = RateLimiter.new(5, 60)        -- login: 5 per minute
limiters.register = RateLimiter.new(3, 3600)   -- register: 3 per hour
limiters.chat = RateLimiter.new(20, 60)        -- chat: 20 messages per minute
limiters.api = RateLimiter.new(100, 60)        -- API: 100 calls per minute

local M = {}
M.limiters = limiters

--- Check `key` against the limiter configured for `action`.
-- @param action limiter name (a key of `limiters`)
-- @param key caller identity passed to the limiter
-- @return true when allowed (actions without a limiter are always allowed),
--   otherwise false plus the limiter's message
function M.check(action, key)
    local limiter = limiters[action]
    if limiter then
        return limiter:check(key)
    end
    return true
end

return M

-- Usage example
local RateLimiter = require "rate_limiter"

--- Login handler skeleton: throttle by client IP before doing any work.
-- NOTE(review): make sure ErrorCode.codes defines TOO_MANY_REQUESTS;
-- if it does not, the reply's `code` field is nil.
function CMD.login(username, password, client_ip)
    -- Throttle login attempts per client IP.
    local ok, err = RateLimiter.check("login", client_ip)
    if not ok then
        return ErrorCode.error(ErrorCode.codes.TOO_MANY_REQUESTS, err)
    end
    
    -- Continue with the actual login logic...
end

性能优化

内存管理

-- lualib/memory_pool.lua
local M = {}

-- 对象池
-- Object pool: hands out recycled objects to cut down on allocation and GC
-- pressure for short-lived values.
local ObjectPool = {}
ObjectPool.__index = ObjectPool

--- Create a pool.
-- @param create_func function returning a new object
-- @param reset_func optional function(obj) restoring an object to its
--   initial state on release
-- @param initial_size number of objects to pre-create (default 10)
-- @param max_size optional cap on idle objects kept by the pool; released
--   objects beyond the cap are dropped for the GC to reclaim. nil = no cap.
-- @return the pool
function ObjectPool.new(create_func, reset_func, initial_size, max_size)
    local pool = setmetatable({
        create = create_func,
        reset = reset_func,
        max_size = max_size,
        objects = {},
    }, ObjectPool)

    -- Pre-create the initial stock.
    for i = 1, initial_size or 10 do
        pool.objects[i] = create_func()
    end

    return pool
end

--- Take an object from the pool, creating one when the pool is empty.
function ObjectPool:acquire()
    if #self.objects > 0 then
        return table.remove(self.objects)
    end
    return self.create()
end

--- Return an object to the pool.
-- The object is reset first (when a reset function was given). nil is
-- ignored, and the object is dropped when the pool is already at `max_size`.
-- The caller must not use the object after releasing it.
function ObjectPool:release(obj)
    if obj == nil then
        return
    end
    if self.max_size and #self.objects >= self.max_size then
        return
    end
    if self.reset then
        self.reset(obj)
    end
    self.objects[#self.objects + 1] = obj
end

M.ObjectPool = ObjectPool

-- Usage example: a pool of bullet objects.
-- NOTE(review): in this listing the example sits above the module's
-- `return M`, so it would run every time the module is loaded; move it to
-- the calling code in a real project.
local BulletPool = ObjectPool.new(
    -- Factory: a zeroed bullet.
    function()
        return {x = 0, y = 0, vx = 0, vy = 0, damage = 0}
    end,
    -- Reset: put every field back to zero before the bullet is reused.
    function(bullet)
        bullet.x = 0
        bullet.y = 0
        bullet.vx = 0
        bullet.vy = 0
        bullet.damage = 0
    end,
    100  -- pre-create 100 bullets
)

-- Take a bullet and position it.
local bullet = BulletPool:acquire()
bullet.x = 100
bullet.y = 200

-- Hand it back once it is no longer needed.
BulletPool:release(bullet)

return M

缓存策略

-- lualib/cache.lua
local skynet = require "skynet"

-- In-memory cache with idle expiry and least-recently-used eviction.
-- An entry expires once it has not been read or written for `ttl` seconds
-- (reads refresh the timer). When the cache is full, adding a new key evicts
-- the entry with the oldest access time.
local Cache = {}
Cache.__index = Cache

--- Create a cache.
-- @param ttl idle lifetime in seconds (default 300)
-- @param max_size maximum number of entries (default 1000)
function Cache.new(ttl, max_size)
    return setmetatable({
        ttl = ttl or 300,
        max_size = max_size or 1000,
        data = {},         -- key -> value
        access_time = {},  -- key -> time of last get/set
        size = 0,          -- number of live entries; saves counting on each set
    }, Cache)
end

--- Remove `key` if present.
function Cache:delete(key)
    if self.data[key] ~= nil then
        self.size = self.size - 1
    end
    self.data[key] = nil
    self.access_time[key] = nil
end

--- Look up `key`.
-- @return the stored value, or nil when absent or expired
function Cache:get(key)
    local value = self.data[key]
    if value == nil then
        return nil
    end

    local now = skynet.time()
    if now - self.access_time[key] > self.ttl then
        self:delete(key)
        return nil
    end

    -- A successful read counts as an access.
    self.access_time[key] = now
    return value
end

-- Evict the entry with the oldest access time (linear scan; only runs when
-- the cache is full and a new key is being added).
local function evict_oldest(self)
    local oldest_key = nil
    local oldest_time = math.huge

    for k, t in pairs(self.access_time) do
        if t < oldest_time then
            oldest_time = t
            oldest_key = k
        end
    end

    if oldest_key ~= nil then
        self:delete(oldest_key)
    end
end

--- Store `value` under `key`.
-- Storing nil is the same as deleting the key. Overwriting an existing key
-- (including one holding `false`) never evicts another entry.
function Cache:set(key, value)
    if value == nil then
        self:delete(key)
        return
    end

    local is_new = self.data[key] == nil
    if is_new then
        if self.max_size and self.size >= self.max_size then
            evict_oldest(self)
        end
        self.size = self.size + 1
    end

    self.data[key] = value
    self.access_time[key] = skynet.time()
end

--- Remove every entry.
function Cache:clear()
    self.data = {}
    self.access_time = {}
    self.size = 0
end

-- 多级缓存
local M = {}

-- 用户缓存(内存 + Redis)
local user_cache = Cache.new(300, 10000)

--- Fetch a user through three tiers: process memory, Redis, then MySQL.
-- A hit in a lower tier is copied into the tiers above it.
-- @param user_id user identifier
-- @return the user record, or nil when no tier has it
function M.get_user(user_id)
    -- Tier 1: in-process cache.
    local cached = user_cache:get(user_id)
    if cached then
        return cached
    end

    -- Tier 2: Redis hash "user:<id>"; an empty reply counts as a miss.
    local from_redis = skynet.call(".redis", "lua", "hgetall", "user:" .. user_id)
    if from_redis and next(from_redis) then
        user_cache:set(user_id, from_redis)
        return from_redis
    end

    -- Tier 3: database.
    local rows = skynet.call(".mysql", "lua", "query",
        "SELECT * FROM users WHERE id = ?", {user_id})
    if not (rows and #rows > 0) then
        return nil
    end

    local row = rows[1]
    -- Backfill Redis first, then the in-process cache.
    skynet.call(".redis", "lua", "hmset", "user:" .. user_id, row)
    user_cache:set(user_id, row)
    return row
end

--- Persist changes to a user and keep the caches coherent.
-- Writes the database first, then Redis, and always drops the in-process
-- entry, even when the Redis write fails, so this process never keeps
-- serving the pre-update record from memory.
-- @param user_id user identifier
-- @param data table of changed fields
-- Re-raises any error from the Redis write after invalidating.
function M.update_user(user_id, data)
    -- Database is the source of truth; let its errors propagate as-is.
    skynet.call(".mysql", "lua", "update", "users", data, {id = user_id})

    -- Guard the Redis write so the invalidation below cannot be skipped.
    local ok, err = pcall(skynet.call, ".redis", "lua", "hmset",
        "user:" .. user_id, data)

    -- Drop the in-process copy; the next read reloads it.
    user_cache:delete(user_id)

    if not ok then
        error(err, 0)
    end
end

return M

批量操作

-- Anti-pattern: one round trip to the database service per user.
-- NOTE(review): the loop bound is hard-coded to 100 rather than #user_ids.
for i = 1, 100 do
    local user = skynet.call(".db", "lua", "get_user", user_ids[i])
    -- process user
end

-- Preferred: a single request that fetches every user at once.
local users = skynet.call(".db", "lua", "batch_get_users", user_ids)
for i, user in ipairs(users) do
    -- process user
end

-- 数据库服务实现批量查询
--- Fetch several users with one `IN (...)` query.
-- @param user_ids array of user ids
-- @return {} for an empty list, otherwise whatever `query` returns
function CMD.batch_get_users(user_ids)
    local total = #user_ids
    if total == 0 then
        return {}
    end

    -- One "?" per id, joined as "?, ?, ...".
    local in_list = string.rep("?", total, ", ")
    local sql = "SELECT * FROM users WHERE id IN (" .. in_list .. ")"

    return query(sql, user_ids)
end

部署运维

启动脚本

#!/bin/bash
# scripts/start.sh
# Start Skynet in the background, record its PID, and confirm it stayed up.
# Paths are relative to the current working directory.

set -e

# Settings
SKYNET_BIN="./skynet"
CONFIG_FILE="./config/prod.conf"
PID_FILE="./skynet.pid"
LOG_FILE="./logs/skynet.log"

# Refuse to start a second instance; clean up a stale PID file.
if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    # An empty PID file is stale too; skip `ps` so it does not print an error.
    if [ -n "$PID" ] && ps -p "$PID" > /dev/null 2>&1; then
        echo "Skynet 已在运行 (PID: $PID)"
        exit 1
    else
        echo "清理过期的 PID 文件"
        rm -f "$PID_FILE"
    fi
fi

# Create the directory that LOG_FILE actually points at.
mkdir -p "$(dirname "$LOG_FILE")"

# Launch Skynet detached from the terminal.
echo "启动 Skynet..."
nohup "$SKYNET_BIN" "$CONFIG_FILE" > "$LOG_FILE" 2>&1 &
PID=$!

# Record the PID for stop.sh.
echo "$PID" > "$PID_FILE"

# Give the process a moment to fail fast (bad config, missing binary, ...).
sleep 2

# Verify it is still alive.
if ps -p "$PID" > /dev/null 2>&1; then
    echo "Skynet 启动成功 (PID: $PID)"
    echo "日志文件: $LOG_FILE"
else
    echo "Skynet 启动失败,请检查日志"
    rm -f "$PID_FILE"
    exit 1
fi

停止脚本

#!/bin/bash
# scripts/stop.sh
# Stop Skynet: send SIGTERM, wait up to MAX_WAIT seconds, then SIGKILL.

set -e

PID_FILE="./skynet.pid"

if [ ! -f "$PID_FILE" ]; then
    echo "Skynet 未运行"
    exit 0
fi

PID=$(cat "$PID_FILE")

# Empty PID file or dead process: nothing to stop, just clean up.
if [ -z "$PID" ] || ! ps -p "$PID" > /dev/null 2>&1; then
    echo "Skynet 进程不存在,清理 PID 文件"
    rm -f "$PID_FILE"
    exit 0
fi

echo "停止 Skynet (PID: $PID)..."

# Ask the process to exit. `|| true`: the process may exit between the check
# above and this kill; under `set -e` that failure would abort the script
# and leave the PID file behind.
kill -TERM "$PID" 2>/dev/null || true

# Wait for the process to go away.
WAIT=0
MAX_WAIT=30

while ps -p "$PID" > /dev/null 2>&1; do
    sleep 1
    WAIT=$((WAIT + 1))

    if [ "$WAIT" -ge "$MAX_WAIT" ]; then
        echo "进程未响应,强制停止..."
        # Same race as above: tolerate "no such process".
        kill -KILL "$PID" 2>/dev/null || true
        break
    fi
done

rm -f "$PID_FILE"
echo "Skynet 已停止"

健康检查

-- service/health_check.lua
local skynet = require "skynet"

local CMD = {}

-- How long to wait for a "ping" reply, in Skynet ticks of 1/100 s (3 s).
local PING_TIMEOUT = 300

-- Ping one service with a deadline. skynet.call itself never times out, so
-- the call runs in a forked coroutine while this coroutine sleeps for at
-- most PING_TIMEOUT; the fork wakes it early on completion.
-- Returns true only when the call returned without error in time.
-- NOTE: after a timeout the forked coroutine stays parked until the target
-- answers or exits; Skynet offers no way to cancel it.
local function ping(addr)
    local waiter = coroutine.running()
    local settled, alive = false, false

    skynet.fork(function()
        local ok = pcall(skynet.call, addr, "lua", "ping")
        if not settled then
            settled, alive = true, ok
            skynet.wakeup(waiter)
        end
    end)

    skynet.sleep(PING_TIMEOUT)
    settled = true  -- ignore any answer that arrives after the deadline
    return alive
end

--- Report the health of the critical services.
-- @return table with `status` ("ok" or "error"), `timestamp`,
--   `services` (name -> "ok" | "not_found" | "unresponsive") and
--   `memory` (MB of Lua heap used by this health service only)
function CMD.check()
    local result = {
        status = "ok",
        timestamp = os.time(),
        services = {},
    }

    -- Services that must be registered and answering.
    local critical_services = {
        ".auth",
        ".db",
        ".redis",
        ".gate",
    }

    for _, name in ipairs(critical_services) do
        local addr = skynet.localname(name)
        if not addr then
            result.status = "error"
            result.services[name] = "not_found"
        elseif ping(addr) then
            result.services[name] = "ok"
        else
            result.status = "error"
            result.services[name] = "unresponsive"
        end
    end

    -- collectgarbage("count") covers this service's Lua VM only, not the
    -- whole process.
    result.memory = collectgarbage("count") / 1024  -- MB

    return result
end

-- Service entry point: register the ".health" name and dispatch "lua"
-- messages to the CMD table.
skynet.start(function()
    -- skynet.register is installed by skynet.manager; plain "skynet" does
    -- not provide it.
    require "skynet.manager"
    skynet.register(".health")

    skynet.dispatch("lua", function(session, source, cmd, ...)
        local f = assert(CMD[cmd], "unknown command: " .. tostring(cmd))
        if session ~= 0 then
            -- The caller is waiting: send back the handler's results.
            skynet.retpack(f(...))
        else
            -- Fire-and-forget message: run without replying.
            f(...)
        end
    end)
end)

监控告警

-- service/monitor.lua
local skynet = require "skynet"

-- Monitor settings.
local config = {
    check_interval = 60,  -- seconds between automatic checks
    
    -- Alert thresholds.
    -- NOTE(review): the check code shown in this file only reads memory_mb
    -- and queue_size; cpu_percent and latency_ms are not referenced there.
    thresholds = {
        memory_mb = 1024,     -- memory above 1 GB
        cpu_percent = 80,     -- CPU above 80%
        queue_size = 1000,    -- message queue longer than 1000
        latency_ms = 100,     -- latency above 100 ms
    },
    
    -- Where alerts are delivered.
    alert_channels = {
        email = "admin@example.com",
        webhook = "https://hooks.slack.com/xxx",
    },
}

local CMD = {}

--- Run one monitoring pass and dispatch any alerts found.
-- Checks this service's Lua heap and the message-queue length of every
-- service known to the launcher.
-- @return array of alert tables (`type`, `value`, `threshold`, `message`,
--   plus `service` for queue alerts)
--
-- NOTE: the STAT request has no timeout; a service that never answers
-- stalls this pass at that service.
function CMD.check()
    local alerts = {}
    local limits = config.thresholds

    -- Memory: collectgarbage("count") covers this service's Lua VM only.
    local mem = collectgarbage("count") / 1024
    if mem > limits.memory_mb then
        alerts[#alerts + 1] = {
            type = "memory",
            value = mem,
            threshold = limits.memory_mb,
            message = string.format("内存使用过高: %.2f MB", mem),
        }
    end

    -- Per-service queue length, via each service's debug "STAT" command.
    local launcher = skynet.localname(".launcher")
    if launcher then
        local services = skynet.call(launcher, "lua", "list")
        for addr, name in pairs(services) do
            local ok, stat = pcall(skynet.call, addr, "debug", "STAT")
            -- Skynet's STAT reply reports the queue length as `mqlen`;
            -- `queue` is kept as a fallback. A reply with neither field is
            -- skipped instead of raising on a nil comparison.
            local queue_len = ok and type(stat) == "table"
                and (stat.mqlen or stat.queue)
            if type(queue_len) == "number" and queue_len > limits.queue_size then
                alerts[#alerts + 1] = {
                    type = "queue",
                    service = name,
                    value = queue_len,
                    threshold = limits.queue_size,
                    message = string.format(
                        "服务 %s 消息队列过长: %d", name, queue_len),
                }
            end
        end
    end

    if #alerts > 0 then
        CMD.send_alerts(alerts)
    end

    return alerts
end

--- Deliver alerts: log each one, then hand it to the e-mail and HTTP
-- services. Both sends are best-effort; a failure is ignored so the
-- remaining alerts still go out.
-- @param alerts array of alert tables as produced by CMD.check
function CMD.send_alerts(alerts)
    for _, alert in ipairs(alerts) do
        skynet.error("ALERT:", alert.message)

        -- E-mail via the ".email" service.
        local mail = {
            to = config.alert_channels.email,
            subject = "Skynet 告警: " .. alert.type,
            body = alert.message,
        }
        pcall(skynet.send, ".email", "lua", "send", mail)

        -- Webhook via the ".http" service.
        local hook = {
            url = config.alert_channels.webhook,
            data = alert,
        }
        pcall(skynet.send, ".http", "lua", "post", hook)
    end
end

-- Service entry point: register ".monitor", start the periodic check loop,
-- and dispatch "lua" messages to the CMD table.
skynet.start(function()
    -- skynet.register is installed by skynet.manager; plain "skynet" does
    -- not provide it.
    require "skynet.manager"
    skynet.register(".monitor")

    -- Periodic check. skynet.sleep counts in 1/100 s, hence the * 100.
    skynet.fork(function()
        while true do
            skynet.sleep(config.check_interval * 100)
            -- Keep the loop alive on failure, but report what went wrong.
            local ok, err = pcall(CMD.check)
            if not ok then
                skynet.error("监控检查失败:", tostring(err))
            end
        end
    end)

    skynet.dispatch("lua", function(session, source, cmd, ...)
        local f = assert(CMD[cmd], "unknown command: " .. tostring(cmd))
        if session ~= 0 then
            -- The caller is waiting: send back the handler's results.
            skynet.retpack(f(...))
        else
            -- Fire-and-forget message: run without replying.
            f(...)
        end
    end)
end)

日志管理

结构化日志

-- lualib/logger.lua
local skynet = require "skynet"
local cjson = require "cjson"

-- Structured logger: every entry is emitted as one JSON object.
local Logger = {}
Logger.__index = Logger

--- Create a logger bound to the current service.
-- @param service_name label written into every entry
function Logger.new(service_name)
    return setmetatable({
        service = service_name,
        service_id = skynet.self(),
    }, Logger)
end

--- Emit one log entry.
-- The entry goes to skynet.error and, best-effort, to the ".log_writer"
-- service. Logging never raises: when `data` cannot be encoded it is
-- replaced by a description of the encoding error.
-- @param level severity label, e.g. "INFO"
-- @param event short event name
-- @param data optional table of extra fields
function Logger:log(level, event, data)
    local entry = {
        timestamp = os.time(),
        level = level,
        service = self.service,
        service_id = string.format(":%08x", self.service_id),
        event = event,
        data = data or {},
    }

    local ok, json = pcall(cjson.encode, entry)
    if not ok then
        -- The payload is the usual culprit (functions, cycles, ...): retry
        -- without it so the event itself is still recorded.
        entry.data = { encode_error = tostring(json) }
        ok, json = pcall(cjson.encode, entry)
        if not ok then
            skynet.error("logger: failed to encode entry:", tostring(json))
            return
        end
    end

    skynet.error(json)

    -- Forward to the log writer without waiting for it.
    pcall(skynet.send, ".log_writer", "lua", "write", json)
end

--- Log at DEBUG level.
function Logger:debug(event, data)
    self:log("DEBUG", event, data)
end

--- Log at INFO level.
function Logger:info(event, data)
    self:log("INFO", event, data)
end

--- Log at WARN level.
function Logger:warn(event, data)
    self:log("WARN", event, data)
end

--- Log at ERROR level.
function Logger:error(event, data)
    self:log("ERROR", event, data)
end

-- Usage example
-- NOTE(review): in this listing the example sits above the module's
-- `return Logger`, so it would run on every load of the module; move it to
-- the calling service in a real project.
local logger = Logger.new("auth_service")

-- An informational event with structured fields.
logger:info("user_login", {
    user_id = 12345,
    username = "player1",
    ip = "192.168.1.100",
    device = "iOS",
})

-- An error event carrying the failing operation and statement.
logger:error("db_error", {
    operation = "query",
    error = "connection timeout",
    sql = "SELECT * FROM users",
})

return Logger

常见问题与解决方案

问题 1:服务无响应

现象:某个服务长时间不返回结果

排查步骤

-- 1. Inspect the service's runtime counters.
--    Skynet's debug "STAT" reply reports the queue length as `mqlen`.
local stat = skynet.call(service_addr, "debug", "STAT")
print("消息队列长度:", stat.mqlen)
print("CPU 时间:", stat.cpu)

-- 2. Look for a deadlock.
--    Inspect what the service reports about itself.
local info = skynet.call(service_addr, "debug", "INFO")

-- 3. Force-kill the service through the launcher.
--    This only stops it; start a replacement separately if one is needed.
skynet.call(".launcher", "lua", "kill", service_addr)

问题 2:内存泄漏

现象:内存持续增长

排查步骤

-- 1. Force a full GC cycle, then take a baseline reading.
--    collectgarbage("count") is in kilobytes; compare later readings
--    against `mem_before` to see whether memory keeps growing.
collectgarbage("collect")
local mem_before = collectgarbage("count")

-- 2. 检查大表
--- Count the entries of `t` plus those of every table reachable through its
-- values (keys are not followed). Each table is counted once, so shared and
-- cyclic references are safe.
-- @param t table to measure
-- @param visited optional set of tables to skip; tables seen here are added
-- @return total number of key/value pairs found
local function count_table_size(t, visited)
    visited = visited or {}
    if visited[t] then
        return 0
    end
    visited[t] = true

    -- Explicit work list in place of recursion.
    local pending = { t }
    local total = 0

    while #pending > 0 do
        local current = table.remove(pending)
        for _, value in pairs(current) do
            total = total + 1
            if type(value) == "table" and not visited[value] then
                visited[value] = true
                pending[#pending + 1] = value
            end
        end
    end

    return total
end

-- 3. 使用对象池复用对象
-- 见上文 memory_pool.lua

-- 4. 定期清理过期数据
-- Background task: purge expired data once an hour.
skynet.fork(function()
    while true do
        skynet.sleep(360000)  -- 360000 ticks of 1/100 s = 1 hour
        -- Guard the cleanup: an uncaught error would end this coroutine
        -- and silently stop all future cleanups.
        local ok, err = pcall(cleanup_expired_data)
        if not ok then
            skynet.error("清理过期数据失败:", tostring(err))
        end
        collectgarbage("collect")
    end
end)

问题 3:高延迟

现象:请求响应时间过长

优化方案

-- 1. Put a deadline on slow calls.
--    The call still suspends this coroutine; the timeout only bounds how
--    long it can wait.
local Timeout = require "timeout"
local ok, result = Timeout.call(5000, ".slow_service", "compute", data)

-- 2. Cache hot data.
local Cache = require "cache"
local user = Cache.get_user(user_id)

-- 3. Batch requests. The command name must match the handler the database
--    service registers (`batch_get_users` in the batch-operation example).
local users = skynet.call(".db", "lua", "batch_get_users", user_ids)

-- 4. Scale horizontally: start several worker instances to share the load.
-- NOTE(review): `instances` must already be a table in the enclosing scope.
for i = 1, 5 do
    instances[i] = skynet.newservice("worker", i)
end

总结

本教程总结了 Skynet 开发的最佳实践:

  1. 代码规范:清晰的目录结构、命名规范、模块组织
  2. 错误处理:统一错误码、防御性编程、超时处理
  3. 安全防护:输入验证、SQL 注入防护、频率限制
  4. 性能优化:内存管理、缓存策略、批量操作
  5. 部署运维:启动/停止脚本、健康检查、监控告警
  6. 日志管理:结构化日志、日志服务
  7. 问题排查:常见问题的诊断和解决方案

遵循这些最佳实践,可以构建出高质量、可维护、安全稳定的 Skynet 游戏服务器。

参考资料

  1. Skynet 官方文档:https://github.com/cloudwu/skynet/wiki
  2. 云风的博客:https://blog.codingnow.com/
  3. Lua 性能优化:http://www.lua.org/gems/
  4. 游戏服务器架构设计最佳实践

继续阅读

探索更多技术文章

浏览归档,发现更多关于系统设计、工具链和工程实践的内容。

全部文章 返回首页