Hello sysmans!
          I run Tru64 V4.0D without a patch kit on an AS4100. After working
for about one and a half years,
        it began to crash. After rebooting, it runs for anywhere from five minutes to
several hours and then crashes again.
        This started yesterday. There weren't _any_ changes on the system.
          Although it is not a production system, it is heavily used.
(Running samba on a Windows network.)
        It has an RA7000 connected to it, with more than 150 GB of disk space in total.
          Since I'm not a programmer, I can't do much with the crash dump
information (i.e. /var/adm/crash/*).
        The message file contains these lines:
...
Mar 12 12:36:41 as4100 vmunix: panic (cpu 0): System Uncorrectable Machine
Check 
Mar 12 12:36:41 as4100 vmunix: syncing disks... device string for dump =
SCSI 0 2000 0 1 100 0 0.
Mar 12 12:36:41 as4100 vmunix: DUMP.prom: dev SCSI 0 2000 0 1 100 0 0, block
262144
Mar 12 12:36:41 as4100 vmunix: device string for dump = SCSI 0 2000 0 1 100
0 0.
Mar 12 12:36:42 as4100 vmunix: DUMP.prom: dev SCSI 0 2000 0 1 100 0 0, block
262144
Mar 12 12:36:42 as4100 vmunix: Alpha boot: available memory from 0x11e8000
to 0x1fffc000
Mar 12 12:36:42 as4100 vmunix: Digital UNIX V4.0D  (Rev. 878); Wed Jan 12
13:14:32 MET 2000
...
        The last line shows the next boot, which is caused by the environment
variable auto_action being set to restart.
        The crash-data file contains lines such as these (I have tried to cut'n'paste the ones
I think may be important):
...
_cpu:  49 
_system_string:  
can't read from process (address 0xfffffc006cdb6f56)
_ncpus:  1 
_avail_cpus:  1 
_partial_dump:  1 
_physmem(MBytes):  511 
_panic_string:  0xfffffc00006cda58 = "System Uncorrectable Machine Check " 
_paniccpu:  0 
_panic_thread:  0xfffffc00022a1b80 
_preserved_message_buffer_begin: 
struct {
    hdr = struct {
        msg_magic = 0x880524
        msg_bufx = 0xcdd
        msg_bufr = 0xbcb
        msg_size = 0xfe0
    }
    msg_bufc = "Alpha boot: available memory from 0x11e8000 to 0x1fffc000
...
_preserved_message_buffer_end: 
_kernel_process_status_begin: 
  PID	COMM
00000	kernel idle
00001	init
00003	kloadsrv
00026	update
00102	syslogd
00104	binlogd
00276	portmap
00278	mountd
00280	nfsd
00282	nfsiod
00337	sendmail
00404	svrSystem_mib
00405	svrMgt_mib
00406	os_mibs
00408	snmpd
00446	advfsd
00456	inetd
00489	cron
00517	lpd
00526	httpd
00529	smbd
00531	nmbd
00535	ircd
00536	iauth
00578	dtlogin
00594	httpd
00595	httpd
00596	httpd
00597	httpd
00598	httpd
00607	nv_mib
00626	ntl_reader
00628	netfmt
00633	ovspmd
00640	nvsecd
00650	ovwdb
00651	pmd
00652	nvpagerd
00653	nvlockd
00665	orsd
00666	trapd
00669	nvcold
00673	ovtopmd
00674	ovactiond
00675	nvcorrd
00676	actionsvr
00677	nvserverd
00679	actionsvr
00680	ovesmd
00681	snmpCollect
00684	ovelmd
00734	bash
00735	getty
00736	getty
00752	smbd
00799	more
00816	smbd
_kernel_process_status_end: 
_current_pid:  816 
_current_tid:  0xfffffc00022a1b80 
_proc_thread_list_begin: 
thread 0xfffffc00022a1b80 stopped at  [boot:1890 ,0xfffffc0000525bf8]
Source not available
_proc_thread_list_end: 
_dump_begin: 
>  0 boot() ["../../../../src/kernel/arch/alpha/machdep.c":1890,
0xfffffc0000525bf8]
nmp = 0x1000
rs = -4398039198160
mycpu = 0
rpb = 0xfffffc00006ef9d8
rpb_cpu = 0x1
item_list = struct {
    function = 0
    out_flags = 984397546
    in_flags = 0
    rtn_status = 18446739675665874836
    next_function = (nil)
    input_data = 1
    output_data = 18446739675750817792
}
   1 panic(0xfffffc001f9e62c0, 0x17, 0xfffffc001fe0e000, 0xfffffc001fe0e000,
0xde67b) ["../../../../src/kernel/bsd/subr_prf.c":737, 0xfffffc00002826b0]
   2 thread_block() ["../../../../src/kernel/kern/sched_prim.c":2101,
0xfffffc00002b4664]
thread = 0xfffffc00022a1b80
new_thread = 0xfffffc0000200100
mycpu = 0
myprocessor = 0xfffffc0000200100
s = 5
pset = 0xfffffc00006f9630
   3 thread_preempt(thread = 0x26, processor = 0xfffffc0000200100)
["../../../../src/kernel/kern/sched_prim.c":3975, 0xfffffc00002b702c]
s = 2
pset = 0xfffffc0000723ff0
   4 boot() ["../../../../src/kernel/arch/alpha/machdep.c":1836,
0xfffffc0000525acc]
nmp = 0x1000
rs = -4398039198160
mycpu = 7300304
rpb = 0xfffffc00006ef9d8
rpb_cpu = 0xde67a
item_list = struct {
    function = 18446739675663059000
    out_flags = 18488
    in_flags = 4294966272
    rtn_status = 1632
    next_function = 0x4
    input_data = 21
    output_data = 18446739675668618248
}
   5 panic(0x303000000d0a3030, 0x15, 0xfffffc0000551c08, 0xfffffffffffffff0,
0xfffffc00011ec308) ["../../../../src/kernel/bsd/subr_prf.c":824,
0xfffffc0000282874]
   6 machcheck(0x1, 0x300000000, 0xfffffc0000004838, 0xffffffffa0ffb0b8,
0xfffffc0000004838) ["../../../../src/kernel/arch/alpha/hal/kn300.c":1984,
0xfffffc0000551c84]
   7 mach_error(0xfffffc0000004838, 0xffffffffa0ffb0b8, 0xfffffc0000004838,
0x0, 0xfffffc0000522000)
["../../../../src/kernel/arch/alpha/hal/cpusw.c":873, 0xfffffc0000536a88]
   8 _XentInt(0x0, 0xfffffc00005222f0, 0xfffffc00006f70a0, 0x15, 0x1)
["../../../../src/kernel/arch/alpha/locore.s":1312, 0xfffffc0000521ffc]
   9 _XentMM(0x0, 0x0, 0x0, 0xfffffc0002a5f088, 0xfffffc001caecf00)
["../../../../src/kernel/arch/alpha/locore.s":1610, 0xfffffc00005222ec]
_dump_end: 
...
_kernel_memory_fault_data_begin:  
struct {
    fault_va = 0x0
    fault_pc = 0x0
    fault_ra = 0x0
    fault_sp = 0x0
    access = 0x0
    status = 0x0
    cpunum = 0x0
    count = 0x0
    pcb = (nil)
    thread = (nil)
    task = (nil)
    proc = (nil)
} 
_kernel_memory_fault_data_end:  
...
          I suspect a faulty CPU and/or memory, since there were no changes,
as I mentioned before. Can anyone
        please help? Sorry for the long mail!
          Regards,
                                                        Nagy Akos, alias
Nagyak (from Hungary)
Received on Mon Mar 12 2001 - 12:13:23 NZDT