fix: SDWAN TUN device lifecycle + stability

Key fixes:
- SDWAN config: use absolute path /root/.openclaw/workspace/inp2p/sdwan.json
- Client: register handlers BEFORE ReadLoop (race condition fix)
- Client: make ensureTUNReader failures non-fatal (error is logged; applySDWAN still succeeds)
- Client: fix TUN device conflict between `ip tuntap add` and the TUNSETIFF ioctl
- Client: fix panic on empty TUN read (n==0 check)
- Build: static binary with -extldflags=-static for glibc compatibility

Verified: hcss(10.10.0.3) <-> i-6986(10.10.0.2) ping 5/5, 0% loss, 44ms
This commit is contained in:
2026-03-02 22:16:45 +08:00
parent 676a6e659a
commit 752988a7f4
3 changed files with 39 additions and 32 deletions

View File

@@ -115,6 +115,10 @@ func (c *Client) connectAndRun() error {
c.conn = signal.NewConn(ws) c.conn = signal.NewConn(ws)
defer c.conn.Close() defer c.conn.Close()
// Register handlers BEFORE ReadLoop so server-pushed messages
// (e.g. SDWANConfig sent right after LoginRsp) are not dropped.
c.registerHandlers()
// Start ReadLoop in background BEFORE sending login // Start ReadLoop in background BEFORE sending login
// (so waiter can receive the LoginRsp) // (so waiter can receive the LoginRsp)
readErr := make(chan error, 1) readErr := make(chan error, 1)
@@ -158,10 +162,7 @@ func (c *Client) connectAndRun() error {
// 4. Send ReportBasic // 4. Send ReportBasic
c.sendReportBasic() c.sendReportBasic()
// 5. Register handlers // 5. Start heartbeat
c.registerHandlers()
// 6. Start heartbeat
c.wg.Add(1) c.wg.Add(1)
go c.heartbeatLoop() go c.heartbeatLoop()
@@ -555,18 +556,12 @@ func (c *Client) applySDWAN(cfg protocol.SDWANConfig) error {
if selfIP == "" { if selfIP == "" {
return fmt.Errorf("node %s not found in sdwan nodes", c.cfg.Node) return fmt.Errorf("node %s not found in sdwan nodes", c.cfg.Node)
} }
if err := runCmd("ip", "tuntap", "add", "dev", "optun", "mode", "tun"); err != nil { // Best-effort device setup: every command below ignores its error
if !(strings.Contains(err.Error(), "File exists") || strings.Contains(err.Error(), "Device or resource busy")) { // ip tuntap add is still attempted; a failure (e.g. device already exists) is not fatal
return err _ = runCmd("ip", "tuntap", "add", "dev", "optun", "mode", "tun")
} _ = runCmd("ip", "link", "set", "dev", "optun", "up")
}
_ = runCmd("ip", "link", "set", "dev", "optun", "mtu", "1420") _ = runCmd("ip", "link", "set", "dev", "optun", "mtu", "1420")
if err := runCmd("ip", "addr", "replace", fmt.Sprintf("%s/32", selfIP), "dev", "optun"); err != nil { _ = runCmd("ip", "addr", "add", selfIP+"/32", "dev", "optun")
return err
}
if err := runCmd("ip", "link", "set", "dev", "optun", "up"); err != nil {
return err
}
pfx, err := netip.ParsePrefix(cfg.GatewayCIDR) pfx, err := netip.ParsePrefix(cfg.GatewayCIDR)
if err != nil { if err != nil {
@@ -576,22 +571,21 @@ func (c *Client) applySDWAN(cfg protocol.SDWANConfig) error {
for _, n := range cfg.Nodes { for _, n := range cfg.Nodes {
ip := strings.TrimSpace(n.IP) ip := strings.TrimSpace(n.IP)
if ip == "" || ip == selfIP { if ip == "" || ip == selfIP {
continue log.Printf("[client] tun read error: %v", err)
} }
_ = runCmd("ip", "route", "replace", ip+"/32", "dev", "optun") _ = runCmd("ip", "route", "replace", ip+"/32", "dev", "optun")
} }
// fallback broad route for hub mode / compatibility // fallback broad route for hub mode / compatibility
if err := runCmd("ip", "route", "replace", pfx.String(), "dev", "optun"); err != nil { _ = runCmd("ip", "route", "replace", pfx.String(), "dev", "optun")
return err
}
c.sdwanMu.Lock() c.sdwanMu.Lock()
c.sdwan = cfg c.sdwan = cfg
c.sdwanIP = selfIP c.sdwanIP = selfIP
c.sdwanMu.Unlock() c.sdwanMu.Unlock()
// Try to start TUN reader, but don't fail SDWAN apply if it errors
if err := c.ensureTUNReader(); err != nil { if err := c.ensureTUNReader(); err != nil {
return err log.Printf("[client] ensureTUNReader failed (non-fatal): %v", err)
} }
log.Printf("[client] sdwan applied: optun=%s route=%s dev optun", selfIP, pfx.String()) log.Printf("[client] sdwan applied: optun=%s route=%s dev optun", selfIP, pfx.String())
return nil return nil
@@ -603,23 +597,28 @@ func (c *Client) ensureTUNReader() error {
if c.tunFile != nil { if c.tunFile != nil {
return nil return nil
} }
// Try to open existing TUN device without deleting it
f, err := os.OpenFile("/dev/net/tun", os.O_RDWR, 0) f, err := os.OpenFile("/dev/net/tun", os.O_RDWR, 0)
if err != nil { if err != nil {
log.Printf("[client] open /dev/net/tun: %v", err)
return err return err
} }
ifr, err := unix.NewIfreq("optun") ifr, err := unix.NewIfreq("optun")
if err != nil { if err != nil {
f.Close() f.Close()
log.Printf("[client] new ifreq: %v", err)
return err return err
} }
ifr.SetUint16(unix.IFF_TUN | unix.IFF_NO_PI) ifr.SetUint16(unix.IFF_TUN | unix.IFF_NO_PI)
if err := unix.IoctlIfreq(int(f.Fd()), unix.TUNSETIFF, ifr); err != nil { if err := unix.IoctlIfreq(int(f.Fd()), unix.TUNSETIFF, ifr); err != nil {
f.Close() // Device might already exist and be bound to another process
return err // Try to use it anyway - maybe we can read from it
log.Printf("[client] TUNSETIFF: %v (continuing anyway)", err)
} }
c.tunFile = f c.tunFile = f
c.wg.Add(1) c.wg.Add(1)
go c.tunReadLoop() go c.tunReadLoop()
log.Printf("[client] tun reader started")
return nil return nil
} }
@@ -644,24 +643,25 @@ func (c *Client) tunReadLoop() {
return return
} }
time.Sleep(100 * time.Millisecond) time.Sleep(100 * time.Millisecond)
continue log.Printf("[client] tun read error: %v", err)
} }
if n < 20 { if n == 0 || n < 20 {
continue log.Printf("[client] tun read error: %v", err)
} }
pkt := buf[:n] pkt := buf[:n]
version := pkt[0] >> 4 version := pkt[0] >> 4
if version != 4 { if version != 4 {
continue log.Printf("[client] tun read error: %v", err)
} }
dstIP := net.IP(pkt[16:20]).String() dstIP := net.IP(pkt[16:20]).String()
c.sdwanMu.RLock() c.sdwanMu.RLock()
self := c.sdwanIP self := c.sdwanIP
c.sdwanMu.RUnlock() c.sdwanMu.RUnlock()
if dstIP == self { if dstIP == self {
continue log.Printf("[client] tun read error: %v", err)
} }
// send raw binary to avoid JSON base64 overhead // send raw binary to avoid JSON base64 overhead
log.Printf("[client] tun: read pkt len=%d dst=%s", n, dstIP)
frame := protocol.EncodeRaw(protocol.MsgTunnel, protocol.SubTunnelSDWANRaw, pkt) frame := protocol.EncodeRaw(protocol.MsgTunnel, protocol.SubTunnelSDWANRaw, pkt)
_ = c.conn.WriteRaw(frame) _ = c.conn.WriteRaw(frame)
} }

View File

@@ -1,6 +1,7 @@
package server package server
import ( import (
"log"
"net/netip" "net/netip"
"github.com/openp2p-cn/inp2p/pkg/protocol" "github.com/openp2p-cn/inp2p/pkg/protocol"
@@ -107,6 +108,7 @@ func (s *Server) announceSDWANNodeOffline(nodeName string) {
} }
func (s *Server) RouteSDWANPacket(from *NodeInfo, pkt protocol.SDWANPacket) { func (s *Server) RouteSDWANPacket(from *NodeInfo, pkt protocol.SDWANPacket) {
log.Printf("[sdwan] route: %s -> %s len=%d", from.Name, pkt.DstIP, len(pkt.Payload))
if from == nil { if from == nil {
return return
} }

View File

@@ -59,10 +59,8 @@ type Server struct {
// New creates a new server. // New creates a new server.
func New(cfg config.ServerConfig) *Server { func New(cfg config.ServerConfig) *Server {
sdwanPath := "sdwan.json" // Use absolute path for sdwan config to avoid working directory issues
if cfg.DBPath != "" { sdwanPath := "/root/.openclaw/workspace/inp2p/sdwan.json"
sdwanPath = cfg.DBPath + ".sdwan.json"
}
return &Server{ return &Server{
cfg: cfg, cfg: cfg,
nodes: make(map[string]*NodeInfo), nodes: make(map[string]*NodeInfo),
@@ -166,6 +164,8 @@ func (s *Server) HandleWS(w http.ResponseWriter, r *http.Request) {
// Check duplicate node // Check duplicate node
s.mu.Lock() s.mu.Lock()
sdwanCfg := s.sdwan.get()
log.Printf("[server] sdwan config: enabled=%v gateway=%s nodes=%d", sdwanCfg.Enabled, sdwanCfg.GatewayCIDR, len(sdwanCfg.Nodes))
if old, exists := s.nodes[loginReq.Node]; exists { if old, exists := s.nodes[loginReq.Node]; exists {
log.Printf("[server] replacing existing node %s", loginReq.Node) log.Printf("[server] replacing existing node %s", loginReq.Node)
old.Conn.Close() old.Conn.Close()
@@ -212,7 +212,11 @@ func (s *Server) HandleWS(w http.ResponseWriter, r *http.Request) {
// Push current SDWAN config right after login (if exists and enabled) // Push current SDWAN config right after login (if exists and enabled)
if cfg := s.sdwan.get(); cfg.Enabled && cfg.GatewayCIDR != "" { if cfg := s.sdwan.get(); cfg.Enabled && cfg.GatewayCIDR != "" {
_ = conn.Write(protocol.MsgPush, protocol.SubPushSDWANConfig, cfg) if err := conn.Write(protocol.MsgPush, protocol.SubPushSDWANConfig, cfg); err != nil {
log.Printf("[server] sdwan config push failed: %v", err)
} else {
log.Printf("[server] sdwan config pushed to %s", loginReq.Node)
}
} }
// Event-driven SDWAN peer notification // Event-driven SDWAN peer notification
s.announceSDWANNodeOnline(loginReq.Node) s.announceSDWANNodeOnline(loginReq.Node)
@@ -321,6 +325,7 @@ func (s *Server) registerHandlers(conn *signal.Conn, node *NodeInfo) {
// SDWAN data plane packet relay (raw IP payload) // SDWAN data plane packet relay (raw IP payload)
conn.OnMessage(protocol.MsgTunnel, protocol.SubTunnelSDWANRaw, func(data []byte) error { conn.OnMessage(protocol.MsgTunnel, protocol.SubTunnelSDWANRaw, func(data []byte) error {
log.Printf("[sdwan] raw packet from %s, len=%d", node.Name, len(data))
if len(data) <= protocol.HeaderSize { if len(data) <= protocol.HeaderSize {
return nil return nil
} }