Index: arch/i386/conf/GENERIC
===================================================================
RCS file: /cvsroot/src/sys/arch/i386/conf/GENERIC,v
retrieving revision 1.592
diff -u -r1.592 GENERIC
--- arch/i386/conf/GENERIC	2004/01/16 17:02:05	1.592
+++ arch/i386/conf/GENERIC	2004/02/13 16:49:21
@@ -46,6 +46,11 @@
 # delay between "rebooting ..." message and hardware reset, in milliseconds
 #options 	CPURESET_DELAY=2000
 
+options 	SD_CLUSTER
+options 	WD_CLUSTER
+options 	CLUSTER_STATS
+options 	CLUSTER_DEBUG
+
 # This option allows you to force a serial console at the specified
 # I/O address. see console(4) for details.
 #options 	CONSDEVNAME="\"com\"",CONADDR=0x2f8,CONSPEED=57600
Index: conf/files
===================================================================
RCS file: /cvsroot/src/sys/conf/files,v
retrieving revision 1.653
diff -u -r1.653 files
--- conf/files	2004/02/03 19:46:18	1.653
+++ conf/files	2004/02/13 16:49:55
@@ -153,6 +153,14 @@
 #
 defparam opt_bufcache.h		BUFCACHE BUFPAGES
 
+#
+# i/o clustering options
+#
+defflag opt_cluster.h		CLUSTER_DEBUG
+defparam opt_cluster.h		MAX_CLUSTERS
+defparam opt_cluster.h		MAX_CLUSTEREDVA
+defflag opt_cluster.h		CLUSTER_STATS
+
 # userconf
 #
 defflag				USERCONF
@@ -1108,6 +1120,7 @@
 file	dev/vnd.c			vnd			needs-flag
 file	kern/core_elf32.c		exec_elf32
 file	kern/core_elf64.c		exec_elf64
+file	kern/subr_cluster.c		sd_cluster | wd_cluster
 file	kern/core_netbsd.c
 file	kern/cnmagic.c
 file	kern/exec_aout.c		exec_aout
Index: dev/ata/files.ata
===================================================================
RCS file: /cvsroot/src/sys/dev/ata/files.ata,v
retrieving revision 1.8
diff -u -r1.8 files.ata
--- dev/ata/files.ata	2003/10/08 20:58:00	1.8
+++ dev/ata/files.ata	2004/02/13 16:49:55
@@ -6,6 +6,8 @@
 # appropriate devices.
 
# ATA disks +defflag opt_ata.h WDCDEBUG WD_CLUSTER + device wd: disk attach wd at ata_hl file dev/ata/wd.c wd needs-flag Index: dev/ata/wd.c =================================================================== RCS file: /cvsroot/src/sys/dev/ata/wd.c,v retrieving revision 1.273 diff -u -r1.273 wd.c --- dev/ata/wd.c 2004/01/10 14:39:50 1.273 +++ dev/ata/wd.c 2004/02/13 16:49:57 @@ -68,6 +68,10 @@ #include __KERNEL_RCSID(0, "$NetBSD: wd.c,v 1.273 2004/01/10 14:39:50 yamt Exp $"); +#if defined(_KERNEL_OPT) +#include "opt_ata.h" +#endif + #ifndef WDCDEBUG #define WDCDEBUG #endif /* WDCDEBUG */ @@ -118,6 +122,15 @@ #define WDLABELDEV(dev) (MAKEWDDEV(major(dev), WDUNIT(dev), RAW_PART)) +#ifdef WD_CLUSTER +static int dowdcluster = 1; +#define WDBUFQ_GET(bs, s) dowdcluster ? cluster((bs), (s)) : BUFQ_GET(bs) +#else +#define WDBUFQ_GET(bs, s) BUFQ_GET(bs) +#endif + +#define WDBUFQ_PUT(bs, bp) BUFQ_PUT(bs, bp) + #define DEBUG_INTR 0x01 #define DEBUG_XFERS 0x02 #define DEBUG_STATUS 0x04 @@ -283,6 +296,9 @@ #else bufq_alloc(&wd->sc_q, BUFQ_DISKSORT|BUFQ_SORT_RAWBLOCK); #endif +#ifdef WD_CLUSTER + cluster_init(); +#endif SLIST_INIT(&wd->sc_bslist); wd->atabus = adev->adev_bustype; @@ -436,8 +452,8 @@ s = splbio(); - /* Kill off any queued buffers. */ - while ((bp = BUFQ_GET(&sc->sc_q)) != NULL) { + /* Kill off any queued buffers. */ + while ((bp = WDBUFQ_GET(&sc->sc_q, 0)) != NULL) { bp->b_error = EIO; bp->b_flags |= B_ERROR; bp->b_resid = bp->b_bcount; @@ -531,6 +547,11 @@ bp->b_rawblkno = blkno; +#ifdef WD_CLUSTER + if (dowdcluster) + cluster_init_buf(bp); +#endif + /* * If the transfer about to be attempted contains only a block that * is known to be bad then return an error for the transfer without @@ -553,7 +574,7 @@ /* Queue transfer on drive, activate drive and controller if idle. */ s = splbio(); - BUFQ_PUT(&wd->sc_q, bp); + WDBUFQ_PUT(&wd->sc_q, bp); wdstart(wd); splx(s); return; @@ -579,7 +600,8 @@ while (wd->openings > 0) { /* Is there a buf for us ? 
*/ - if ((bp = BUFQ_GET(&wd->sc_q)) == NULL) + bp = WDBUFQ_GET(&wd->sc_q, wd->sc_dk.dk_label->d_secsize); + if (bp == NULL) return; /* Index: dev/scsipi/files.scsipi =================================================================== RCS file: /cvsroot/src/sys/dev/scsipi/files.scsipi,v retrieving revision 1.37 diff -u -r1.37 files.scsipi --- dev/scsipi/files.scsipi 2003/10/08 10:58:13 1.37 +++ dev/scsipi/files.scsipi 2004/02/13 16:50:06 @@ -6,6 +6,7 @@ defflag opt_scsi.h SCSIVERBOSE ST_ENABLE_EARLYWARN SES_ENABLE_PASSTHROUGH SCSI_OLD_NOINQUIRY + SD_CLUSTER defparam opt_scsi.h ST_MOUNT_DELAY SDRETRIES SD_IO_TIMEOUT defflag opt_scsipi_debug.h SCSIPI_DEBUG Index: dev/scsipi/sd.c =================================================================== RCS file: /cvsroot/src/sys/dev/scsipi/sd.c,v retrieving revision 1.214 diff -u -r1.214 sd.c --- dev/scsipi/sd.c 2003/12/23 13:12:25 1.214 +++ dev/scsipi/sd.c 2004/02/13 16:50:07 @@ -56,8 +56,11 @@ #include __KERNEL_RCSID(0, "$NetBSD: sd.c,v 1.214 2003/12/23 13:12:25 pk Exp $"); +#if defined(_KERNEL_OPT) #include "opt_scsi.h" #include "opt_bufq.h" +#endif + #include "rnd.h" #include @@ -95,6 +98,16 @@ #define SDLABELDEV(dev) (MAKESDDEV(major(dev), SDUNIT(dev), RAW_PART)) +#ifdef SD_CLUSTER +static int dosdcluster = 1; +#define SDBUFQ_GET(bs, s) dosdcluster ? cluster((bs), (s)) : BUFQ_GET(bs) +#else +#define SDBUFQ_GET(bs, s) BUFQ_GET(bs) +#endif + +#define SDBUFQ_PUT(bs, bp) BUFQ_PUT(bs, bp) + + int sdlock __P((struct sd_softc *)); void sdunlock __P((struct sd_softc *)); void sdminphys __P((struct buf *)); @@ -234,6 +247,9 @@ #else bufq_alloc(&sd->buf_queue, BUFQ_DISKSORT|BUFQ_SORT_RAWBLOCK); #endif +#ifdef SD_CLUSTER + cluster_init(); +#endif /* * Store information needed to contact our base driver @@ -359,7 +375,7 @@ s = splbio(); /* Kill off any queued buffers. 
*/ - while ((bp = BUFQ_GET(&sd->buf_queue)) != NULL) { + while ((bp = SDBUFQ_GET(&sd->buf_queue, 0)) != NULL) { bp->b_error = EIO; bp->b_flags |= B_ERROR; bp->b_resid = bp->b_bcount; @@ -667,6 +683,7 @@ return (0); } + /* * Actually translate the requested transfer into one the physical driver * can understand. The transfer is described by a buf and will include @@ -749,6 +766,10 @@ bp->b_rawblkno = blkno; +#ifdef SD_CLUSTER + if (dosdcluster) + cluster_init_buf(bp); +#endif s = splbio(); /* @@ -757,7 +778,7 @@ * XXX Only do disksort() if the current operating mode does not * XXX include tagged queueing. */ - BUFQ_PUT(&sd->buf_queue, bp); + SDBUFQ_PUT(&sd->buf_queue, bp); /* * Tell the device to get going on the transfer if it's @@ -825,7 +846,7 @@ /* * See if there is a buf with work for us to do.. */ - if ((bp = BUFQ_GET(&sd->buf_queue)) == NULL) + if ((bp = SDBUFQ_GET(&sd->buf_queue, lp->d_secsize)) == NULL) return; /* @@ -1180,6 +1201,9 @@ if ((flag & FWRITE) == 0) return (EBADF); return (sd_setcache(sd, *(int *) addr)); + + /* Not supported on this device. */ + return (EOPNOTSUPP); case DIOCCACHESYNC: /* Index: kern/subr_cluster.c =================================================================== RCS file: subr_cluster.c diff -N subr_cluster.c --- /dev/null Thu Jul 11 11:18:53 2002 +++ subr_cluster.c Fri Feb 13 16:50:18 2004 @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2002 Network Storage Solutions, Inc. + * All rights reserved. + * + * Written by Chris M. Jepeway for Network Storage Solutions, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Network Storage Solutions, Inc. + * 4. The name of Network Storage Solutions, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY NETWORK STORAGE SOLUTIONS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NETWORK STORAGE SOLUTIONS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if defined(_KERNEL_OPT)
+#include "opt_cluster.h"
+#endif
+
+static struct vm_map *cluster_map = NULL;	/* submap cluster VA is taken from */
+
+/* Max # of buffers we cluster together -- the default  */
+/* setting is MAXPHYS worth of pages; its dependency on */
+/* PAGE_SIZE means that MAX_CLUSTERS is not a constant, */
+/* so it is copied into max_clusters at init time       */
+#ifndef MAX_CLUSTERS
+#define MAX_CLUSTERS	(MAXPHYS / PAGE_SIZE)
+#endif
+
+/* Max amount of VA we'll ever allow to be mapped by clusters */
+#ifndef MAX_CLUSTEREDVA
+#define MAX_CLUSTEREDVA	(32 * 1024 * 1024)
+#endif
+
+/* Make "constants" patchable; -1 means "take the default in cluster_init()" */
+static int max_clusters = -1;
+static int max_clusteredva = -1;
+
+#if defined(DIAGNOSTIC) || defined(CLUSTER_STATS)
+/* Counters for determining how well our maxes are set  */
+/* XXX - since interrupts cause these to be updated, we */
+/*	 either need to synchronize access to them or   */
+/*	 assume they are updated atomically; take the   */
+/*	 latter, easier approach for now                */
+static unsigned long currentclusters;	/* clusters issued, not yet completed */
+
+static unsigned long totalclusters;	/* This might overflow, if we do > 135 */
+					/* (or so) clusters a second for a year; */
+					/* if we ever synchronize updates, make */
+					/* it a long long or a double */
+static unsigned long totalclusteredbuffers;	/* Total # of buffers */
+						/* put into clusters */
+static unsigned long totalclusteredpages;	/* Total # of pages */
+						/* put into clusters */
+static unsigned long missedclusters;/* We hope this never overflows */
+
+/* Wrap a statistics update so that it compiles away when stats are off */
+#define CLUSTER_STAT(x)	x
+
+#else
+#define CLUSTER_STAT(x)
+#endif
+
+#define b_clusterflags	b_flags		/* hide our flags here */
+#define B_CLUSTERFAILED	B_XXX		/* clustering failed */
+
+#ifdef DIAGNOSTIC	/* BUFQ_REMOVE: dequeue the head, asserting it is bp */
+#define BUFQ_REMOVE(bs, bp) \
+	do { \
+		struct buf *fbp; \
+		fbp = BUFQ_GET(bs); \
+		KASSERT(fbp == bp); \
+	} while (0)
+#else /* !DIAGNOSTIC: just dequeue the head */
+#define BUFQ_REMOVE(bs, bp)	(void) BUFQ_GET(bs)
+#endif /* DIAGNOSTIC */
+
+/* Set the patchable limits and create the cluster VA submap (first call). */
+void
+cluster_init(void)
+{
+	vaddr_t min, max;	/* uvm_km_suballoc() takes vaddr_t bounds */
+
+	if (max_clusters < 0)
+		max_clusters = MAX_CLUSTERS;
+	if (max_clusteredva < 0)
+		max_clusteredva = MAX_CLUSTEREDVA;
+
+	min = max = 0;
+	if (cluster_map == NULL) {
+		cluster_map = uvm_km_suballoc(kernel_map,
+					      &min, &max, max_clusteredva,
+					      VM_MAP_INTRSAFE, 0, NULL);
+		if (cluster_map == NULL)
+			panic("cluster_init");
+		/* Called per wd/sd queue set-up: zero the counters only once */
+		CLUSTER_STAT(currentclusters = 0);
+		CLUSTER_STAT(totalclusters = 0);
+		CLUSTER_STAT(totalclusteredbuffers = 0);
+		CLUSTER_STAT(totalclusteredpages = 0);
+		CLUSTER_STAT(missedclusters = 0);
+	}
+}
+
+/*
+ * Initialize a buf for use in clustering: clear the "clustering failed" mark
+ */
+void
+cluster_init_buf(struct buf *bp)
+{
+	bp->b_clusterflags &= ~B_CLUSTERFAILED;
+}
+
+static void clusterdone(struct buf *);
+
+/* Book-keeping for one cluster; stored in the cluster buf's b_private */
+typedef struct cluster {
+	struct bufq_state *bstate;	/* queue the member bufs came from */
+	int nbufs;			/* # of entries in cluster[] */
+	struct buf *cluster[1];		/* members, ascending b_rawblkno; */
+} cluster_t;				/* really nbufs long, see malloc() */
+
+/*
+ * Get the next buf to issue from bstate, or NULL if it is empty.  Runs of
+ * page-sized, same-direction, disk-contiguous bufs are merged into one new
+ * cluster buf; if that fails (or secsz is 0) one plain buf is returned.
+ */
+struct buf *
+cluster(struct bufq_state *bstate, u_int32_t secsz)
+{
+	cluster_t *cl;
+	struct buf *cbp, *bp, *bufs[2 * max_clusters], **bpp;
+	int i, n, off, s, nblks;
+	daddr_t top, bot;
+	int size;
+	int topi, boti;
+	long rw;
+	paddr_t p;
+
+	KASSERT(max_clusters > 0);
+	KASSERT(max_clusteredva > 0);
+
+	bp = BUFQ_PEEK(bstate);
+	if (bp == NULL)
+		return NULL;
+
+	if (secsz == 0)
+		/* Must be reading disklabel */
+		goto nocluster;
+
+	s = splbio();
+	cbp = pool_get(&bufpool, PR_NOWAIT);
+	splx(s);
+	if (cbp == NULL)
+		goto nocluster;
+
+	n = 0;
+	size = 0;
+	rw = bp->b_flags & B_READ;
+	bot = top = bp->b_rawblkno;
+	boti = topi = max_clusters;	/* grow both ways from mid-bufs[] */
+	while (bp) {
+		/*
+		 * XXX - we could map a page that's not full at the
+		 *	 end of the cluster; instead, we only cluster
+		 *	 buffers that are multiples of the page size
+		 */
+		if ((bp->b_flags & B_READ) != rw ||
+		    (vaddr_t) bp->b_data != trunc_page((vaddr_t) bp->b_data) ||
+		    bp->b_bcount != trunc_page(bp->b_bcount) ||
+		    size + bp->b_bcount > MAXBSIZE ||
+		    n == max_clusters ||
+		    bp->b_clusterflags & B_CLUSTERFAILED)
+			break;
+
+		nblks = howmany(bp->b_bcount, secsz);
+
+		if (bp->b_rawblkno == top) {
+			i = topi++;	/* extends the run upward */
+			top += nblks;
+		} else if (bp->b_rawblkno + nblks == bot) {
+			i = --boti;	/* extends the run downward */
+			bot -= nblks;
+		} else
+			break;
+
+		n++;
+		BUFQ_REMOVE(bstate, bp);
+		bufs[i] = bp;
+		size += bp->b_bcount;
+
+		bp = BUFQ_PEEK(bstate);
+	}
+
+	switch (n) {
+	case 0:
+		BUFQ_REMOVE(bstate, bp);
+		/* Park it in the slot that dropcluster returns */
+		bufs[max_clusters] = bp;
+		/* FALLTHROUGH */
+	case 1:
+		goto dropcluster;
+	}
+
+	cl = malloc(sizeof(*cl) - sizeof(cl->cluster) +
+			n * sizeof(cl->cluster[0]),
+		    M_DEVBUF, M_NOWAIT);
+	if (cl == NULL)
+		goto dropcluster;
+
+	cbp->b_data = (caddr_t) uvm_km_kmemalloc(cluster_map, NULL, size,
+						 UVM_KMF_VALLOC |
+						 UVM_KMF_TRYLOCK);
+	if (cbp->b_data == 0)
+		goto freecluster;
+	cbp->b_bufsize = 0;
+
+	cl->nbufs = n;
+	memcpy(cl->cluster, &bufs[boti], n * sizeof(bufs[0]));
+
+	cl->bstate = bstate;
+
+	bpp = cl->cluster;
+	off = 0;
+	bp = *bpp++;
+	while (cbp->b_bufsize < size) {
+		if (off >= bp->b_bcount) {
+			bp = *bpp++;
+			off = 0;
+		}
+
+		/*
+		 * Try to find page mapped by this offset.
+		 */
+		p = vtophys((vaddr_t) bp->b_data + off);
+		if (p == 0)
+			/*
+			 * XXX - For now, drop everything;
+			 *	 could instead continue w/
+			 *	 those mappings we've been
+			 *	 able to establish
+			 */
+			goto dropmap;
+
+		/* ?: binds looser than |, so the conditional must be grouped */
+		pmap_kenter_pa((vaddr_t) cbp->b_data + cbp->b_bufsize, p,
+		    VM_PROT_READ | ((rw == B_READ) ? VM_PROT_WRITE : 0));
+
+		off += PAGE_SIZE;
+		cbp->b_bufsize += PAGE_SIZE;
+	}
+	pmap_update(pmap_kernel());
+
+	cbp->b_resid = cbp->b_bcount = cbp->b_bufsize;
+	cbp->b_private = cl;
+
+	cbp->b_vp = NULL;
+	cbp->b_rawblkno = bot;
+	cbp->b_blkno = cbp->b_lblkno = -1;
+	LIST_INIT(&cbp->b_dep);
+
+	cbp->b_flags = B_BUSY | B_CALL | rw;
+	cbp->b_iodone = clusterdone;
+	CLUSTER_STAT(currentclusters++);
+	CLUSTER_STAT(totalclusters++);
+	CLUSTER_STAT(totalclusteredbuffers += cl->nbufs);
+	CLUSTER_STAT(totalclusteredpages += cbp->b_bcount >> PAGE_SHIFT);
+
+	return cbp;
+
+ dropmap:
+	if (cbp->b_bufsize)
+		pmap_kremove((vaddr_t) cbp->b_data, cbp->b_bufsize);
+	uvm_km_free_wakeup(cluster_map, (vaddr_t) cbp->b_data, size);
+ freecluster:
+	free(cl, M_DEVBUF);
+	CLUSTER_STAT(missedclusters++);
+ dropcluster:
+	cbp->b_flags = B_INVAL;
+# if DIAGNOSTIC
+	cbp->b_private = (void *) 0xdeadbeef;
+	cbp->b_iodone = NULL;
+# endif
+	s = splbio();
+	pool_put(&bufpool, cbp);
+	splx(s);
+	for (i = boti; i < topi; i++)	/* requeue all but the first buf seen */
+		if (i != max_clusters)
+			BUFQ_PUT(bstate, bufs[i]);
+	bp = bufs[max_clusters];	/* the first buf examined */
+	return bp;
+
+ nocluster:
+	BUFQ_REMOVE(bstate, bp);
+	return bp;
+}
+
+/*
+ * Completion of a cluster buf: unmap it, then finish or requeue each member.
+ * Should be called at splbio()
+ */
+static void
+clusterdone(struct buf *cbp)
+{
+	struct buf *bp;
+	cluster_t *cl;
+	int i;
+
+	pmap_kremove((vaddr_t) cbp->b_data, cbp->b_bufsize);
+	pmap_update(pmap_kernel());
+	uvm_km_free_wakeup(cluster_map,
+			   (vaddr_t) cbp->b_data, cbp->b_bufsize);
+	cbp->b_bufsize = 0;
+
+	cl = (cluster_t *) cbp->b_private;
+# if DIAGNOSTIC
+	if (cl == (void *) 0xdeadbeef)
+		/* cbp was already released by cluster() or clusterdone() */
+		panic("cluster is deadbeef");
+# endif
+
+	i = cl->nbufs;
+# ifdef CLUSTER_DEBUG
+	printf("clusterdone: %p\t%d\n", cbp, i);
+# endif
+
+	while (i--) {	/* last member first: resid is charged to the tail */
+		bp = cl->cluster[i];
+
+		/*
+		 * We should never find a buf in a cluster that
+		 * has failed a previous try at clustering
+		 */
+		KASSERT((bp->b_clusterflags & B_CLUSTERFAILED) == 0);
+
+		if (cbp->b_flags & B_ERROR) {
+			/*
+			 * Uncluster and re-try
+			 */
+			bp->b_clusterflags |= B_CLUSTERFAILED;
+			BUFQ_PUT(cl->bstate, bp);
+		} else {
+			bp->b_resid = min(cbp->b_resid, bp->b_bcount);
+			if (cbp->b_resid) {
+				cbp->b_resid -= bp->b_bcount;
+				if (cbp->b_resid < 0)
+					cbp->b_resid = 0;
+			}
+
+			biodone(bp);
+		}
+	}
+
+	cbp->b_flags = B_INVAL;
+# if DIAGNOSTIC
+	cbp->b_private = (void *) 0xdeadbeef;
+	cbp->b_iodone = NULL;
+# endif
+
+	pool_put(&bufpool, cbp);
+
+	free(cl, M_DEVBUF);
+	CLUSTER_STAT(currentclusters--);
+}
Index: sys/buf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/buf.h,v
retrieving revision 1.71
diff -u -r1.71 buf.h
--- sys/buf.h	2004/01/25 18:06:49	1.71
+++ sys/buf.h	2004/02/13 16:50:28
@@ -127,6 +127,10 @@
 #define BUFQ_PEEK(bufq) \
 	(*(bufq)->bq_get)((bufq), 0)	/* Get buffer from queue */
 
+void	cluster_init __P((void));
+void	cluster_init_buf __P((struct buf *));
+struct buf *cluster __P((struct bufq_state *, u_int32_t));
+
 #endif /* _KERNEL */
 
 /*