#!/bin/bash
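# QMI failover watchdog: if the QMI link is down, restart quectel-qmi, fall
# back to a modem DMS reset, and reboot after repeated failed runs while no
# primary network (eth1/wlan0) is online. Run from cron every 15 min.
# Logs to /app/qmi-log/qmi-failover.log.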

# Log destination. Creating the directory is best effort: a failure is
# ignored here and will show up as failed appends inside log().
QMI_LOG_DIR="/app/qmi-log"
LOG="${QMI_LOG_DIR}/qmi-failover.log"
mkdir -p "$QMI_LOG_DIR" 2>/dev/null || :

# Append one line to $LOG: an ISO-8601 timestamp (seconds precision), a
# space, then all arguments joined by spaces.
log()
{
	printf '%s %s\n' "$(date -Iseconds)" "$*" >> "$LOG"
}

# --- Modem and connectivity probe ---
DEV="/dev/cdc-wdm0"          # QMI control device handed to qmi-network/qmicli
PING_TARGET="8.8.8.8"        # address pinged to verify connectivity

# --- Recovery escalation ---
MAX_TRIES_BEFORE_RESET=3     # stop/start attempts before a DMS reset
RESET_POLL_INTERVAL_SEC=10   # pause between health polls after a DMS reset
RESET_POLL_MAX_SEC=900       # post-reset polling budget (15 minutes)

# --- Persistent failure counter and reboot policy ---
STATE_DIR="/var/lib/matelex"
STATE_FILE="${STATE_DIR}/qmi-failover.state"
REBOOT_THRESHOLD=4           # counted failures at which a reboot is issued
REBOOT_DELAY_SEC=10          # grace period before the reboot is issued

log "=== qmi-failover run ==="

# Print the name of the first network interface that exposes qmi/raw_ip in
# sysfs. Interfaces named wwan* are examined before all others.
# Returns 0 when a name was printed, 1 when no interface qualifies.
find_qmi_iface()
{
	local path name
	# The wwan* glob is listed first so those interfaces win over any other
	# match; an unmatched glob stays literal and is skipped by the -d test.
	for path in /sys/class/net/wwan* /sys/class/net/*; do
		[ -d "$path" ] || continue
		name="${path##*/}"
		if [ -f "/sys/class/net/$name/qmi/raw_ip" ] && echo "$name"; then
			return 0
		fi
	done
	return 1
}

# Interface detected at startup; empty when none was found.
IFACE="$(find_qmi_iface 2>/dev/null)" || true

# Print the second whitespace-separated field of every line containing
# "Status:" in the output of `qmi-network $DEV status` (stderr discarded).
# The exit status is awk's, not qmi-network's.
get_qmi_network_status()
{
	local report
	report="$(/usr/bin/qmi-network "$DEV" status 2>/dev/null)"
	printf '%s\n' "$report" | /usr/bin/awk '/Status:/ {print $2}'
}

# Ping $PING_TARGET through the current QMI interface ($IFACE).
# Returns 1 without pinging when $IFACE is empty; otherwise returns the
# status of ping_on_iface, which owns the actual ping command line so both
# probes always use the same options.
ping_ok()
{
	[ -n "$IFACE" ] || return 1
	ping_on_iface "$IFACE"
}

# Ping $PING_TARGET through the interface named by $1 with a 3 second
# deadline (-w 3). Ping's stdout and stderr are appended to $LOG.
# Returns ping's exit status.
ping_on_iface()
{
	local via="$1"
	/usr/bin/ping -I "$via" -w 3 "$PING_TARGET" >> "$LOG" 2>&1
}

# Decide whether the interface named by $1 provides working connectivity.
# Checks, in order: the sysfs directory exists; operstate (when the file
# exists) reads "up"; carrier (when the file exists) reads "1"; a ping of
# $PING_TARGET through the interface succeeds.
# Every failed check after the first is logged. Returns 0 when online, 1 otherwise.
iface_online()
{
	local iface="$1"
	local sysfs="/sys/class/net/$iface"
	local state link

	[ -d "$sysfs" ] || return 1

	if [ -f "$sysfs/operstate" ]; then
		state=$(/bin/cat "$sysfs/operstate" 2>/dev/null)
		if [ "$state" != "up" ]; then
			log "primary check: $iface operstate=${state:-unknown}"
			return 1
		fi
	fi

	if [ -f "$sysfs/carrier" ]; then
		link=$(/bin/cat "$sysfs/carrier" 2>/dev/null)
		if [ "$link" != "1" ]; then
			log "primary check: $iface carrier=${link:-unknown}"
			return 1
		fi
	fi

	if ! ping_on_iface "$iface"; then
		log "primary check: $iface cannot ping $PING_TARGET"
		return 1
	fi

	log "primary check: $iface can ping $PING_TARGET"
	return 0
}

# Return 0 as soon as one of the primary interfaces (eth1, then wlan0)
# passes iface_online; return 1 when neither does.
primary_network_online()
{
	local candidate
	for candidate in eth1 wlan0; do
		if iface_online "$candidate"; then
			return 0
		fi
	done
	return 1
}

# Load the persistent failure counter into the global FAIL_COUNT.
# Reads the "fail_count=<n>" line from $STATE_FILE (creating $STATE_DIR on a
# best-effort basis). A missing file, a missing line, or a value that is not
# purely decimal digits yields 0.
load_fail_count()
{
	FAIL_COUNT=0
	[ -d "$STATE_DIR" ] || /bin/mkdir -p "$STATE_DIR" 2>/dev/null || true
	if [ -f "$STATE_FILE" ]; then
		FAIL_COUNT="$(/usr/bin/awk -F= '/^fail_count=/ {print $2}' "$STATE_FILE" 2>/dev/null)"
	fi
	case "$FAIL_COUNT" in
		''|*[!0-9]*)
			FAIL_COUNT=0
			;;
		*)
			# Force base 10: a leading zero would otherwise make later
			# arithmetic such as $((FAIL_COUNT + 1)) parse the value as
			# octal ("08" is an error, "010" silently becomes 8).
			FAIL_COUNT=$((10#$FAIL_COUNT))
			;;
	esac
}

# Persist the global FAIL_COUNT to $STATE_FILE as a single
# "fail_count=<n>" line, replacing any previous content. $STATE_DIR is
# created on a best-effort basis first. Returns the status of the write.
save_fail_count()
{
	/bin/mkdir -p "$STATE_DIR" 2>/dev/null || :
	printf "fail_count=%s\n" "$FAIL_COUNT" > "$STATE_FILE"
}

# Zero the global FAIL_COUNT and persist it through save_fail_count.
# Returns the status of save_fail_count.
reset_fail_count()
{
	# Zero first, then save, so the state file mirrors the in-memory value.
	FAIL_COUNT=0; save_fail_count
}

# Reset the failure counter, logging the previous value and the reason ($1)
# only when the counter was non-zero. The reset (and therefore the state
# file write) happens on every call, whether or not anything was logged.
reset_fail_count_if_needed()
{
	local why="$1"
	[ "$FAIL_COUNT" -ne 0 ] &&
		log "resetting QMI fail count from $FAIL_COUNT (reason: $why)"
	reset_fail_count
}

# Count one failed recovery run (reason in $1) and reboot the system once
# the persistent counter reaches $REBOOT_THRESHOLD.
# The primary network (eth1/wlan0) is re-checked before counting, before
# announcing the reboot, and again after the $REBOOT_DELAY_SEC grace period;
# whenever it is online the counter is reset and no reboot happens.
# Returns 0 on every path that comes back to the caller.
record_failure_and_maybe_reboot()
{
	local cause="$1"

	if primary_network_online; then
		reset_fail_count_if_needed "primary network online before reboot count"
		log "skipping reboot count for '$cause' because eth1/wlan0 is online"
		return 0
	fi

	FAIL_COUNT=$((FAIL_COUNT + 1))
	save_fail_count
	log "long-term failure count incremented to $FAIL_COUNT (reason: $cause)"

	# Below the threshold there is nothing more to do in this run.
	[ "$FAIL_COUNT" -ge "$REBOOT_THRESHOLD" ] || return 0

	if primary_network_online; then
		reset_fail_count_if_needed "primary network online before reboot"
		log "failure threshold reached, but eth1/wlan0 is online; reboot cancelled"
		return 0
	fi

	log "failure threshold reached ($FAIL_COUNT >= $REBOOT_THRESHOLD); rebooting in ${REBOOT_DELAY_SEC}s"
	sleep "$REBOOT_DELAY_SEC"

	# Last chance: the primary link may have come back during the delay.
	if primary_network_online; then
		reset_fail_count_if_needed "primary network online after reboot delay"
		log "reboot delay elapsed, but eth1/wlan0 is online; reboot cancelled"
		return 0
	fi

	/bin/systemctl reboot >> "$LOG" 2>&1 || true
}

# Restart the QMI connection by running the quectel-qmi helper with "stop",
# waiting 2 seconds, then running it with "start". Helper output goes to
# $LOG. A failing "stop" is ignored; the return value is the status of
# "start".
do_stop_start()
{
	local helper="/usr/lib/matelex/quectel-qmi"

	log "restarting quectel-qmi (stop -> start)"
	"$helper" stop >> "$LOG" 2>&1 || true
	sleep 2
	"$helper" start >> "$LOG" 2>&1
}

# Handle a classification ($1) for which only a single service refresh is
# attempted: reset the failure counter, restart quectel-qmi once, then end
# the script with status 0. No DMS reset is issued and the failure counter
# is not incremented. This function never returns to its caller.
refresh_qmi_once_no_reboot()
{
	local class="$1"

	reset_fail_count_if_needed "$class"
	log "classification '$class': refreshing QMI service once in this cron run; no DMS reset, no reboot count"
	# The restart outcome is deliberately ignored on this path.
	do_stop_start >> "$LOG" 2>&1 || :
	log "=== qmi-failover done ($class-refresh-only) ==="
	exit 0
}

# Probe the QMI data path. Refreshes the global IFACE via find_qmi_iface,
# then requires BOTH a successful ping through that interface and a
# qmi-network status of "connected". The probe result is logged.
# Returns 0 when healthy, 1 otherwise (including when no interface exists).
qmi_healthy()
{
	local link_state
	local ping_rc=0

	IFACE="$(find_qmi_iface 2>/dev/null || true)"
	if [ -z "$IFACE" ]; then
		log "qmi health: no QMI interface (qmi/raw_ip)"
		return 1
	fi

	ping_ok || ping_rc=$?
	link_state="$(get_qmi_network_status)"
	log "qmi health: iface=$IFACE status=${link_state:-unknown}; ping_rc=$ping_rc"

	if [ "$link_state" != "connected" ] || [ "$ping_rc" -ne 0 ]; then
		return 1
	fi
	return 0
}

# Escalation step after the stop/start attempts failed: issue a modem
# DMS reset, then poll qmi_healthy every $RESET_POLL_INTERVAL_SEC seconds
# for at most $RESET_POLL_MAX_SEC seconds of wall-clock time. If the link
# has not recovered by then, restart quectel-qmi.service and probe once
# more.
# Returns 0 as soon as QMI is healthy, 1 when it is still down at the end.
#
# NOTE(review): the file header says cron runs this script every 15 min and
# the default polling budget is also 15 min, so a run that reaches this
# path can still be active when the next cron invocation starts; nothing in
# this function guards against that overlap.
do_dms_reset_then_wait()
{
	local deadline

	log "triggering dms-reset after ${MAX_TRIES_BEFORE_RESET} failed attempts"
	/usr/bin/qmicli -d "$DEV" --dms-reset >> "$LOG" 2>&1 || true

	log "polling up to ${RESET_POLL_MAX_SEC}s after reset (interval ${RESET_POLL_INTERVAL_SEC}s)"
	# Budget by elapsed time (bash's SECONDS), not by summing the sleep
	# intervals: each qmi_healthy call itself takes several seconds, which
	# would otherwise stretch the loop well past the announced maximum.
	deadline=$((SECONDS + RESET_POLL_MAX_SEC))
	while [ "$SECONDS" -lt "$deadline" ]; do
		if qmi_healthy; then
			log "recovered during post-reset polling"
			return 0
		fi
		sleep "$RESET_POLL_INTERVAL_SEC"
	done

	log "post-reset polling timed out; restarting quectel-qmi.service"
	/bin/systemctl restart quectel-qmi.service >> "$LOG" 2>&1 || true
	sleep 3
	if qmi_healthy; then
		log "recovered after service restart"
		return 0
	fi

	log "still not recovered after reset+restart"
	return 1
}

# Collapse multi-line qmicli output ($1) into a single line for logging:
# newlines become spaces and runs of whitespace are squeezed to one space.
_qmi_flatten_output()
{
	echo "$1" | /usr/bin/tr '\n' ' ' | /usr/bin/sed 's/[[:space:]]\+/ /g'
}

# Classify the SIM / registration state by querying the modem with qmicli
# and print exactly one word on stdout:
#   no-sim         - UIM card state is 'absent'
#   sim-locked     - PIN1 or UPIN is enabled-not-verified / blocked /
#                    permanently-blocked
#   ps-barring     - serving system reports Packet switched: 'all-calls'
#   not-registered - registration state or service status indicates no
#                    usable registration
#   unknown        - none of the above matched
# Queries are issued lazily: later qmicli calls only run when the earlier
# output did not already decide the classification. Every raw qmicli
# response (including error text, since stderr is captured) is logged.
classify_sim_and_registration()
{
	local uim serving sysinfo

	uim=$(/usr/bin/qmicli -d "$DEV" --uim-get-card-status 2>&1 || true)
	log "uim card status: $(_qmi_flatten_output "$uim")"

	if echo "$uim" | /usr/bin/grep -qi "Card state: 'absent'"; then
		echo "no-sim"
		return 0
	fi
	if echo "$uim" | /usr/bin/grep -Eqi "PIN1 state: '(enabled-not-verified|blocked|permanently-blocked)'|UPIN state: '(enabled-not-verified|blocked|permanently-blocked)'"; then
		echo "sim-locked"
		return 0
	fi

	serving=$(/usr/bin/qmicli -d "$DEV" --nas-get-serving-system 2>&1 || true)
	log "nas serving system: $(_qmi_flatten_output "$serving")"

	# NOTE(review): verify against the libqmi QmiNasCallBarringStatus
	# documentation whether 'all-calls' means "all calls barred" or "all
	# calls allowed"; this code treats it as barring.
	if echo "$serving" | /usr/bin/grep -q "Packet switched: 'all-calls'"; then
		echo "ps-barring"
		return 0
	fi

	# NOTE(review): 'not searching' contains a space unlike the other
	# alternatives; confirm the exact strings qmicli prints here.
	if echo "$serving" | /usr/bin/grep -Eqi "Registration state: '(not-registered|not searching|registration-denied|unknown)'"; then
		echo "not-registered"
		return 0
	fi

	sysinfo=$(/usr/bin/qmicli -d "$DEV" --nas-get-system-info 2>&1 || true)
	log "nas system info: $(_qmi_flatten_output "$sysinfo")"
	if echo "$sysinfo" | /usr/bin/grep -Eqi "Service status: '(none|limited|no-service)'|Domain: 'none'"; then
		echo "not-registered"
		return 0
	fi

	echo "unknown"
}

# Full recovery ladder for a recoverable QMI fault (reason in $1):
#   1. up to $MAX_TRIES_BEFORE_RESET stop/start cycles, probing after each;
#   2. a DMS reset followed by polling (do_dms_reset_then_wait);
#   3. if still down and the primary network is online, give up without
#      counting a failure;
#   4. otherwise record the failure, which may trigger a reboot.
# Every path ends the script with exit status 0; this function never
# returns to its caller.
recover_qmi_or_count_failure()
{
	local cause="$1"
	local attempt

	for ((attempt = 1; attempt <= MAX_TRIES_BEFORE_RESET; attempt++)); do
		log "recoverable QMI fault '$cause': stop/start attempt $attempt/$MAX_TRIES_BEFORE_RESET"
		do_stop_start
		sleep 2
		if qmi_healthy; then
			reset_fail_count_if_needed "QMI recovered after stop/start"
			log "=== qmi-failover done (recovered) ==="
			exit 0
		fi
	done

	if do_dms_reset_then_wait; then
		reset_fail_count_if_needed "QMI recovered after DMS reset"
		log "=== qmi-failover done (recovered) ==="
		exit 0
	fi

	if primary_network_online; then
		reset_fail_count_if_needed "primary network online after failed QMI recovery"
		log "QMI recovery failed for '$cause', but eth1/wlan0 is online; no reboot count"
		log "=== qmi-failover done (primary-online-after-recovery) ==="
		exit 0
	fi

	record_failure_and_maybe_reboot "$cause"
	log "=== qmi-failover done (failed) ==="
	exit 0
}

# ---- Main flow ----------------------------------------------------------
# Every branch below that enters refresh_qmi_once_no_reboot or
# recover_qmi_or_count_failure ends the script inside that function.

load_fail_count

# Without the QMI control device there is nothing this script can manage.
[ -c "$DEV" ] || {
	log "no $DEV device node; cannot manage QMI, exiting"
	log "=== qmi-failover done (no-device) ==="
	exit 0
}

# SIM / registration problems get a single refresh only; PS barring goes
# through the full recovery ladder. Any other result falls through.
classification="$(classify_sim_and_registration)"
case "$classification" in
	ps-barring)
		log "PS barring detected; entering QMI recovery flow"
		recover_qmi_or_count_failure "ps-barring"
		;;
	no-sim|sim-locked|not-registered)
		refresh_qmi_once_no_reboot "$classification"
		;;
esac

# $IFACE here is the value detected at script start.
[ -n "$IFACE" ] || {
	log "no QMI interface (qmi/raw_ip); classified as qmi-service-failure"
	recover_qmi_or_count_failure "qmi-service-failure:no-qmi-iface"
}

if qmi_healthy; then
	reset_fail_count_if_needed "QMI healthy"
	log "=== qmi-failover done (ok) ==="
	exit 0
fi

# Unhealthy: choose the recovery reason from the reported link status.
status="$(get_qmi_network_status)"
case "$status" in
	connected)
		recover_qmi_or_count_failure "qmi-connected-but-no-ping"
		;;
	*)
		recover_qmi_or_count_failure "qmi-service-failure:status-${status:-unknown}"
		;;
esac

exit 0