/* $XConsortium: cir_colexp.c,v 1.3 95/01/05 20:47:57 kaleb Exp $ */
/* $XFree86: xc/programs/Xserver/hw/xfree86/vga256/drivers/cirrus/cir_colexp.c,v 3.8 1995/04/09 14:14:23 dawes Exp $ */
/*
 *
 * Copyright 1994 by H. Hanemaayer, Utrecht, The Netherlands
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of H. Hanemaayer not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  H. Hanemaayer makes no representations
 * about the suitability of this software for any purpose.  It is provided
 * "as is" without express or implied warranty.
 *
 * H. HANEMAAYER DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
 * EVENT SHALL H. HANEMAAYER BE LIABLE FOR ANY SPECIAL, INDIRECT OR
 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 *
 * Author:  H. Hanemaayer, <hhanemaa@cs.ruu.nl>
 *
 */


/*
 * This file contains the low level accelerated functions that use color
 * expansion/extended write modes, which are supported on all chipsets.
 * Most functions divide the area into banking regions, taking advantage
 * of the 16K bank granularity, and have efficient inner loops without
 * bank checks. Tile and stipple fill draw in scanline interleaved order.
 *
 * Operations performed with these functions:
 * - Solid fill
 * - Transparent/Opaque 32 bits-wide stipple fill
 * - Multiple-of-8 wide tile fill
 * - Special case BitBlt (scrolling) for the 5420/2/4
 * - Small size plain framebuffer fill/BitBlt.
 *
 */
 

#include "X.h"
#include "Xmd.h"
#include "servermd.h"
#include "gcstruct.h"
#include "window.h"
#include "pixmapstr.h"
#include "scrnintstr.h"
#include "windowstr.h"

#include "cfb.h"
#include "cfbmskbits.h"
#include "cfbrrop.h"
#include "mergerop.h"
#include "xf86.h"
#include "vgaBank.h"
#include "vga.h"	/* For vgaInfoRec. */
#include "xf86_HWlib.h"

#ifndef __GNUC__
#undef __volatile__
#define __volatile__ volatile
#endif

#include "compiler.h"


extern pointer vgaBase;

#include "cir_driver.h"
#include "cir_inline.h"
#include "cir_span.h"


/*
 * byte_reversed[b] is the byte b with its eight bits in mirrored order
 * (bit 0 <-> bit 7, bit 1 <-> bit 6, ...). Sixteen entries per row, so
 * row r holds the entries for indices 16*r .. 16*r + 15.
 */

unsigned char byte_reversed[256] = {
	0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,
	0x08,0x88,0x48,0xc8,0x28,0xa8,0x68,0xe8,0x18,0x98,0x58,0xd8,0x38,0xb8,0x78,0xf8,
	0x04,0x84,0x44,0xc4,0x24,0xa4,0x64,0xe4,0x14,0x94,0x54,0xd4,0x34,0xb4,0x74,0xf4,
	0x0c,0x8c,0x4c,0xcc,0x2c,0xac,0x6c,0xec,0x1c,0x9c,0x5c,0xdc,0x3c,0xbc,0x7c,0xfc,
	0x02,0x82,0x42,0xc2,0x22,0xa2,0x62,0xe2,0x12,0x92,0x52,0xd2,0x32,0xb2,0x72,0xf2,
	0x0a,0x8a,0x4a,0xca,0x2a,0xaa,0x6a,0xea,0x1a,0x9a,0x5a,0xda,0x3a,0xba,0x7a,0xfa,
	0x06,0x86,0x46,0xc6,0x26,0xa6,0x66,0xe6,0x16,0x96,0x56,0xd6,0x36,0xb6,0x76,0xf6,
	0x0e,0x8e,0x4e,0xce,0x2e,0xae,0x6e,0xee,0x1e,0x9e,0x5e,0xde,0x3e,0xbe,0x7e,0xfe,
	0x01,0x81,0x41,0xc1,0x21,0xa1,0x61,0xe1,0x11,0x91,0x51,0xd1,0x31,0xb1,0x71,0xf1,
	0x09,0x89,0x49,0xc9,0x29,0xa9,0x69,0xe9,0x19,0x99,0x59,0xd9,0x39,0xb9,0x79,0xf9,
	0x05,0x85,0x45,0xc5,0x25,0xa5,0x65,0xe5,0x15,0x95,0x55,0xd5,0x35,0xb5,0x75,0xf5,
	0x0d,0x8d,0x4d,0xcd,0x2d,0xad,0x6d,0xed,0x1d,0x9d,0x5d,0xdd,0x3d,0xbd,0x7d,0xfd,
	0x03,0x83,0x43,0xc3,0x23,0xa3,0x63,0xe3,0x13,0x93,0x53,0xd3,0x33,0xb3,0x73,0xf3,
	0x0b,0x8b,0x4b,0xcb,0x2b,0xab,0x6b,0xeb,0x1b,0x9b,0x5b,0xdb,0x3b,0xbb,0x7b,0xfb,
	0x07,0x87,0x47,0xc7,0x27,0xa7,0x67,0xe7,0x17,0x97,0x57,0xd7,0x37,0xb7,0x77,0xf7,
	0x0f,0x8f,0x4f,0xcf,0x2f,0xaf,0x6f,0xef,0x1f,0x9f,0x5f,0xdf,0x3f,0xbf,0x7f,0xff,
};

/*
 * Left-edge masks, indexed by the left-to-right bit number (0..7) of the
 * first pixel to keep: entry k has the k most significant bits cleared.
 */

static unsigned char leftbitmask[8] = {
	0xff >> 0, 0xff >> 1, 0xff >> 2, 0xff >> 3,
	0xff >> 4, 0xff >> 5, 0xff >> 6, 0xff >> 7
};

/*
 * Right-edge masks, indexed by the number of pixels (0..8) still to be
 * drawn in the last byte: entry k has only its k most significant bits set.
 */

static unsigned char rightbitmask[9] = {
	(0xff00 >> 0) & 0xff,
	(0xff00 >> 1) & 0xff, (0xff00 >> 2) & 0xff,
	(0xff00 >> 3) & 0xff, (0xff00 >> 4) & 0xff,
	(0xff00 >> 5) & 0xff, (0xff00 >> 6) & 0xff,
	(0xff00 >> 7) & 0xff, (0xff00 >> 8) & 0xff
};


/*
 * This is the special function for small widths (< 32).
 * Just uses the banked plain framebuffer.
 * It is somewhat faster than the cfb.banked equivalents (it takes
 * advantage of the 16K bank granularity).
 *
 * Arguments:
 * x, y		Coordinates of the destination area.
 * w, h		Size of the area (w is expected to be < 32).
 * bits_in	Stipple, one 32-bit word per pattern line.
 * sh		Height of the stipple in lines.
 * sox, soy	Stipple origin.
 * bg, fg	Pixel values written for 0 and 1 stipple bits.
 * destpitch	Scanline width of screen in bytes.
 *
 * Every pixel of the area is written (opaque fill): a set stipple bit
 * yields fg, a clear bit yields bg. Bits are consumed starting with the
 * least significant bit of the rotated pattern word.
 *
 * Currently not used because of bug (see xgc time percentage scrollbar).
 *
 * NOTE(review): the pattern word is rotated for sox only and is then
 * consumed from bit 0 at destination pixel x. Whether that is the intended
 * alignment for an x that is not a multiple of 32 depends on rotateleft()
 * and on the caller -- confirm; it may be related to the bug noted above.
 */

#ifdef __STDC__
void Cirrus32bitFillSmall( int x, int y, int w, int h,
unsigned long *bits_in, int sh, int sox, int soy, int bg, int fg,
int destpitch )
#else
void Cirrus32bitFillSmall( x, y, w, h, bits_in, sh, sox, soy, bg, fg,
destpitch )
	int x, y, w, h;
	unsigned long *bits_in;
	int sh, sox, soy, bg, fg, destpitch;
#endif
{
	int j;
	int destaddr;
	unsigned char *destp;
	int syindex;
	int bank;
	unsigned char *base;	/* Video window base address. */
	unsigned char color[2];

	base = CIRRUSWRITEBASE();	/* Write window. */
	destaddr = y * destpitch + x;

	CIRRUSSETWRITEB(destaddr, bank);

	/* Lookup table: stipple bit value -> pixel value. */
	color[0] = bg;
	color[1] = fg;

	syindex = (y - soy) % sh;	/* y index into source bitmap. */
	/* The remainder is negative when y lies above the stipple origin; */
	/* bring it back into 0..sh-1 so bits_in is never indexed below 0. */
	if (syindex < 0)
		syindex += sh;

	for (j = 0; j < h; j++) {
		unsigned long bits;
		int count;
		
		bits = rotateleft(32 - (sox & 31), bits_in[syindex]);

		CIRRUSCHECKWRITEB(destaddr, bank);
		destp = base + destaddr;

		count = w;
		if (bits == 0xffffffff)
			/* All-ones pattern line: plain foreground run. */
			__memset(destp, fg, w);
		else {
			/* Unrolled by eight; one pixel per stipple bit. */
			while (count > 8) {
				*destp = color[bits & 1]; bits >>= 1;
				*(destp + 1) = color[bits & 1]; bits >>= 1;
				*(destp + 2) = color[bits & 1]; bits >>= 1;
				*(destp + 3) = color[bits & 1]; bits >>= 1;
				*(destp + 4) = color[bits & 1]; bits >>= 1;
				*(destp + 5) = color[bits & 1]; bits >>= 1;
				*(destp + 6) = color[bits & 1]; bits >>= 1;
				*(destp + 7) = color[bits & 1]; bits >>= 1;
				destp += 8;
				count -= 8;
			}
			/* Remaining 1..8 pixels. */
			while (count > 0) {
				*destp = color[bits & 1];
				bits >>= 1;
				destp++;
				count--;
			}
		}

		destaddr += destpitch;

		syindex++;
		if (syindex == sh)
			syindex = 0;	/* Wrap pattern vertically. */
	}
}


/*
 * This is a bitblt function. It takes advantage of the 8 data latches that
 * can be enabled in BY8 addressing mode to do efficient vertical bitblts.
 * I believe this makes scrolling bearable on the chips that don't
 * have the bitblt engine, i.e. <= 5424. On a local bus, it may even
 * rival the bitblt engine in speed.
 *
 * Arguments:
 * x1, y1	Coordinates of source area.
 * x2, y2	Coordinates of destination area.
 * w, h		Size of area to be copied.
 * destpitch	Scanline width of screen in bytes.
 *
 * x1 must be equal to x2 (actually works if (x1 % 8 == x2 % 8)).
 * Copies from top to bottom. For overlapping areas, correct if
 * (y1 > y2 || (y1 == y2 && x1 > x2)).
 *
 * The data latches work similar to VGA write mode 1 (for planar modes).
 * Basically, a read from display memory fills the 8 latches with 8 pixels,
 * and subsequent writes (CPU data written doesn't matter) will each write
 * the 8 pixels stored in the latches.
 *
 * This new version divides into unbanked regions, with efficient assembler
 * inner loop; the left and right edge are drawn separately within each
 * region.
 *
 */

#ifdef AVOID_ASM_ROUTINES

/*
 * Portable C version of the span copy helper.
 * Copies bcount bytes, one byte at a time and in ascending order, from
 * srcp to destp for each of n lines; both pointers advance by destpitch
 * (which may be negative) after every line.
 */
static void CirrusLatchCopySpans(srcp, destp, bcount, n, destpitch)
	unsigned char *srcp;
	unsigned char *destp;
	int bcount;
	int n;
	int destpitch;
{
	int line;

	for (line = n; line > 0; line--) {
		unsigned char *from = srcp;
		unsigned char *to = destp;
		int left = bcount;

		/* Read one byte, then write one byte, left to right. */
		while (left-- > 0)
			*to++ = *from++;

		srcp += destpitch;
		destp += destpitch;
	}
}

#endif

void CirrusLatchedBitBlt(x1, y1, x2, y2, w, h, destpitch)
	int x1, y1, x2, y2, w, h, destpitch;
{
	int j;
	int destaddr, srcaddr;
	unsigned char *destp, *srcp;
	int writebank, readbank;
	int bitoffset;
	int nspans, bcount, leftbyte;

	unsigned char *readbase;	/* Video read window base address. */
	unsigned char *writebase;	/* Video write window */

	readbase = CIRRUSREADBASE();
	writebase = CIRRUSWRITEBASE();

	destaddr = y2 * destpitch + x2;
	srcaddr = y1 * destpitch + x1;

	/* Enable extended write modes, BY8 addressing, and 8 byte data */
	/* latches. Every addressing byte corresponds to 8 pixels. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING |
			EIGHTDATALATCHES);
	}
	else {
		SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING |
			EIGHTDATALATCHES | DOUBLEBANKED);
	}

	SETWRITEMODE(1);

	SETPIXELMASK(0xff);
	SETFOREGROUNDCOLOR(0);		/* Disable set/reset. */

	/* Bitmask offset of leftmost byte (group of 8 pixels). */
	bitoffset = destaddr & 7;
	leftbyte = bitoffset == 0 ? 0 : 1;
	/* Number of full bytes (groups of 8 pixels). */
	bcount = bitoffset == 0 ? w / 8 : (w - (8 - bitoffset)) / 8;

	destaddr >>= 3;			/* Divide address by 8. */
	srcaddr >>= 3;

	nspans = h;	/* Number of spans to go. */

	CIRRUSSETREADB(srcaddr, readbank);
	CIRRUSSETWRITEB(destaddr, writebank);

	for (;;) {
		int nread, nwrite, n;
		/* Calculate how many scanlines fit in the banking region. */
		nread = CIRRUSWRITEREGIONLINES(srcaddr, destpitch >> 3);
		nwrite = CIRRUSWRITEREGIONLINES(destaddr, destpitch >> 3);
		n = nread;
		if (nwrite < n)
			n = nwrite;
		if (n > nspans)
			n = nspans;
		nspans -= n;

		/* Do first byte (left edge). */
		if (bitoffset != 0) {
			int i;
			SETPIXELMASK(leftbitmask[bitoffset]);
			srcp = readbase + srcaddr;
			destp = writebase + destaddr;
			for (i = 0; i < n; i++) {
				/* Write mode 1 latch read/write. */
				*destp = *srcp;
				srcp += destpitch >> 3;
				destp += destpitch >> 3;
			}
			SETPIXELMASK(0xff);
		}

		CirrusLatchCopySpans(
			readbase + srcaddr + leftbyte,
			writebase + destaddr + leftbyte,
			bcount,
			n,
			destpitch >> 3
			);

		/* Do last byte (right edge). */
		if (((x1 + w) & 7) > 0) {
			int i;
			SETPIXELMASK(rightbitmask[(x1 + w) & 7]);
			srcp = readbase + srcaddr + leftbyte + bcount;
			destp = writebase + destaddr + leftbyte + bcount;
			for (i = 0; i < n; i++) {
				/* Write mode 1 latch read/write. */
				*destp = *srcp;
				srcp += destpitch >> 3;
				destp += destpitch >> 3;
			}
			SETPIXELMASK(0xff);
		}

		if (nspans == 0)
			break;
		srcaddr += n * (destpitch >> 3);
		destaddr += n * (destpitch >> 3);
		CIRRUSCHECKREADB(srcaddr, readbank);
		CIRRUSCHECKWRITEB(destaddr, writebank);
	}

	/* Disable extended write modes and BY8 addressing. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(SINGLEBANKED);
	}
	else {
		SETMODEEXTENSIONS(DOUBLEBANKED);
	}
	SETWRITEMODE(0);
	SETFOREGROUNDCOLOR(0x00);	/* Disable set/reset. */
}


void CirrusLatchedBitBltReversed(x1, y1, x2, y2, w, h, destpitch)
	int x1, y1, x2, y2, w, h, destpitch;
{
	int j;
	int destaddr, srcaddr;
	unsigned char *destp, *srcp;
	int writebank, readbank;
	int bitoffset;
	int nspans, bcount, leftbyte;

	unsigned char *readbase;	/* Video read window base address. */
	unsigned char *writebase;	/* Video write window */

	readbase = CIRRUSREADBASE();
	writebase = CIRRUSWRITEBASE();

	destaddr = (y2 + h - 1) * destpitch + x2;
	srcaddr = (y1 + h - 1) * destpitch + x1;

	/* Enable extended write modes, BY8 addressing, and 8 byte data */
	/* latches. Every addressing byte corresponds to 8 pixels. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING |
			EIGHTDATALATCHES);
	}
	else {
		SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING |
			EIGHTDATALATCHES | DOUBLEBANKED);
	}

	SETWRITEMODE(1);

	SETPIXELMASK(0xff);
	SETFOREGROUNDCOLOR(0);		/* Disable set/reset. */

	/* Bitmask offset of leftmost byte (group of 8 pixels). */
	bitoffset = destaddr & 7;
	leftbyte = bitoffset == 0 ? 0 : 1;
	/* Number of full bytes (groups of 8 pixels). */
	bcount = bitoffset == 0 ? w / 8 : (w - (8 - bitoffset)) / 8;

	destaddr >>= 3;			/* Divide address by 8. */
	srcaddr >>= 3;

	nspans = h;	/* Number of spans to go. */

	CIRRUSSETREADB(srcaddr, readbank);
	CIRRUSSETWRITEB(destaddr, writebank);

	for (;;) {
		int nread, nwrite, n;

		/* Adjust bank regions. */
		CIRRUSCHECKREVERSEDREADB(srcaddr, readbank, destpitch >> 3);
		CIRRUSCHECKREVERSEDWRITEB(destaddr, writebank, destpitch >> 3);

		/* Calculate how many scanlines fit in the banking region. */
		nread = CIRRUSREVERSEDWRITEREGIONLINES(srcaddr, destpitch >> 3);
		nwrite = CIRRUSREVERSEDWRITEREGIONLINES(destaddr, destpitch >> 3);
		n = nread;
		if (nwrite < n)
			n = nwrite;
		if (n > nspans)
			n = nspans;
		nspans -= n;

		/* Do first byte (left edge). */
		if (bitoffset != 0) {
			int i;
			SETPIXELMASK(leftbitmask[bitoffset]);
			srcp = readbase + srcaddr;
			destp = writebase + destaddr;
			for (i = 0; i < n; i++) {
				/* Write mode 1 latch read/write. */
				*destp = *srcp;
				srcp -= destpitch >> 3;
				destp -= destpitch >> 3;
			}
			SETPIXELMASK(0xff);
		}

		CirrusLatchCopySpans(
			readbase + srcaddr + leftbyte,
			writebase + destaddr + leftbyte,
			bcount,
			n,
			- (destpitch >> 3)
			);

		/* Do last byte (right edge). */
		if (((x1 + w) & 7) > 0) {
			int i;
			SETPIXELMASK(rightbitmask[(x1 + w) & 7]);
			srcp = readbase + srcaddr + leftbyte + bcount;
			destp = writebase + destaddr + leftbyte + bcount;
			for (i = 0; i < n; i++) {
				/* Write mode 1 latch read/write. */
				*destp = *srcp;
				srcp -= destpitch >> 3;
				destp -= destpitch >> 3;
			}
			SETPIXELMASK(0xff);
		}

		if (nspans == 0)
			break;
		srcaddr -= n * (destpitch >> 3);
		destaddr -= n * (destpitch >> 3);
	}

	/* Disable extended write modes and BY8 addressing. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(SINGLEBANKED);
	}
	else {
		SETMODEEXTENSIONS(DOUBLEBANKED);
	}
	SETWRITEMODE(0);
	SETFOREGROUNDCOLOR(0x00);	/* Disable set/reset. */
}


/*
 * Conventional framebuffer bitblt; no (x1 & 7 == x2 & 7) restriction.
 *
 * Arguments:
 * x1, y1	Coordinates of source area.
 * x2, y2	Coordinates of destination area.
 * w, h		Size of area to be copied.
 * destpitch	Scanline width of screen in bytes.
 *
 * Copies scanline by scanline from top to bottom through the banked read
 * and write windows, re-checking both banks before every scanline.
 * Nothing is done for an empty area (w <= 0 or h <= 0).
 */

#ifdef __STDC__
void CirrusSimpleBitBlt( int x1, int y1, int x2, int y2, int w, int h,
int destpitch )
#else
void CirrusSimpleBitBlt( x1, y1, x2, y2, w, h, destpitch )
	int x1, y1, x2, y2, w, h, destpitch;
#endif
{
	int j;
	int destaddr, srcaddr;
	unsigned char *destp, *srcp;
	int writebank, readbank;
	unsigned char *readbase;	/* Video read window base address. */
	unsigned char *writebase;	/* Video write window */

	/* A non-positive width must never reach __memcpy as a length. */
	if (w <= 0 || h <= 0)
		return;

	readbase = CIRRUSREADBASE();
	writebase = CIRRUSWRITEBASE();

	destaddr = y2 * destpitch + x2;
	srcaddr = y1 * destpitch + x1;

	CIRRUSSETREADB(srcaddr, readbank);
	CIRRUSSETWRITEB(destaddr, writebank);

	for (j = 0; j < h; j++) {
		CIRRUSCHECKREADB(srcaddr, readbank);
		CIRRUSCHECKWRITEB(destaddr, writebank);

		/* Address in write window. */
		destp = writebase + destaddr;
		/* Address in read window. */
		srcp = readbase + srcaddr;

		__memcpy(destp, srcp, w);

		destaddr += destpitch;
		srcaddr += destpitch;
	}
}


/*
 * Optimized solid fill.
 * Divides area in regions within banks.
 * Uses the full 64K window in BY8 addressing mode, giving effective
 * bank regions of 512K.
 * w >= 32.
 */

#ifdef AVOID_ASM_ROUTINES

/*
 * Portable C version of the solid span writer.
 * For each of h lines, writes starting at destp:
 *   - if leftbcount > 0: one byte leftmask, then leftbcount - 1 bytes 0xff;
 *   - midlcount 32-bit words of 0xffffffff;
 *   - if rightbcount > 0: rightbcount - 1 bytes 0xff, then one byte
 *     rightmask.
 * destp advances by destpitch bytes from line to line.
 *
 * The middle words are stored through an unsigned int pointer so that
 * exactly four bytes are written per word, also where unsigned long is
 * wider than 32 bits.
 */
static void CirrusColorExpandWriteSpans(destp, leftmask, leftbcount,
midlcount, rightbcount, rightmask, h, destpitch)
	unsigned char *destp;
	int leftmask, leftbcount, midlcount, rightbcount, rightmask;
	int h;
	int destpitch;
{
	while (h > 0) {
		unsigned char *destpsave;
		int i;
		destpsave = destp;
		/* Left part: masked first byte, then full bytes. */
		if (leftbcount > 0) {
			*destp = leftmask;
			destp++;
			for (i = 0; i < leftbcount - 1; i++) {
				*destp = 0xff;
				destp++;
			}
		}
		/* Middle part: full 32-bit words. */
		for (i = 0; i < midlcount; i++) {
			*(unsigned int *)destp = 0xffffffff;
			destp += 4;
		}
		/* Right part: full bytes, then masked last byte. */
		if (rightbcount > 1) 
			for (i = 0; i < rightbcount - 1; i++) {
				*destp = 0xff;
				destp++;
			}
		if (rightbcount > 0)
			*destp = rightmask;
		destp = destpsave + destpitch;
		h--;
	}
}

#endif

/*
 * Solid fill of the w x h area at (x, y) with pixel value fg, using
 * extended write mode 4 in BY8 addressing mode (every addressing byte
 * corresponds to 8 pixels). destpitch is the scanline width in bytes.
 *
 * Each scanline is split into a left part (masked first byte plus bytes
 * up to the next 32-bit boundary), a middle part of full 32-bit words and
 * a right part (full bytes plus a masked last byte). The area is processed
 * per banking region by CirrusColorExpandWriteSpans.
 */
void CirrusColorExpandSolidFill(x, y, w, h, fg, destpitch)
	int x, y, w, h, fg, destpitch;
{
	int destaddr;
	int bank;
	unsigned char *base;	/* Video window base address. */
	int nspans, bitoffset;
	int leftmask, leftbcount, midlcount, rightbcount, rightmask;

	base = CIRRUSSINGLEBASE();		/* Single Read/Write window. */
	destaddr = y * destpitch + x;

	/* Enable extended write modes and BY8 addressing. */
	/* Every addressing byte corresponds to 8 pixels. */
	SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING | SINGLEBANKED);

	SETWRITEMODE(4);

	SETPIXELMASK(0xff);
	SETFOREGROUNDCOLOR(fg);

	/* rightmask is only computed when there is a right part; give it */
	/* a defined value because it is always passed to the span writer. */
	rightmask = 0;

	/* Calculate masks and counts. */
	/* Bit offset of leftmost pixel of area to be filled. */
	bitoffset = destaddr & 7;
	destaddr >>= 3;			/* Divide address by 8. */
	leftmask = leftbitmask[bitoffset];
	/* w now counts the pixels to the right of the first byte. */
	w -= 8 - bitoffset;
	if (w < 0) {
		/* Just one byte. */
		leftmask &= rightbitmask[w + 8];
		leftbcount = 1;
		midlcount = 0;
		rightbcount = 0;
		goto masksdone;
	}
	/* Bytes up to the next 32-bit boundary, first byte included. */
	leftbcount = 4 - (destaddr & 3);
	if ((leftbcount - 1) * 8 > w)
		/* The area ends before that boundary. */
		leftbcount = w / 8 + 1;
	w -= (leftbcount - 1) * 8;
	midlcount = w / 32;
	w &= 31;
	rightbcount = (w + 7) / 8;
	if (rightbcount > 0)
		rightmask = rightbitmask[w - (rightbcount - 1) * 8];
masksdone:

	CIRRUSSETSINGLEB(destaddr, bank);

	nspans = h;	/* Number of spans to go. */
	for (;;) {
		int n;
		/* Calculate how many scanlines fit in this banking region. */
		n = CIRRUSSINGLEREGIONLINES(destaddr, destpitch >> 3);
		if (n > nspans)
			n = nspans;
		nspans -= n;

		CirrusColorExpandWriteSpans(base + destaddr,
			leftmask, leftbcount, midlcount, rightbcount,
			rightmask, n, destpitch >> 3);

		if (nspans == 0)
			break;
		destaddr += n * (destpitch >> 3);
		CIRRUSCHECKSINGLEB(destaddr, bank);
	}

	/* Disable extended write modes and BY8 addressing. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(SINGLEBANKED);
	}
	else {
		SETMODEEXTENSIONS(DOUBLEBANKED);
	}
	SETWRITEMODE(0);
	SETFOREGROUNDCOLOR(0x00);	/* Disable set/reset. */
}


/*
 * Optimized stipple, dividing into unbanked regions.
 * Scanlines are written interleaved, maximizing consecutive writes of the
 * same stipple bits word. Assumes a virtual screen width that is a multiple
 * of 32 (otherwise video memory access will be largely unaligned).
 */

#ifdef AVOID_ASM_ROUTINES

/*
 * Portable C version of the stipple span writer.
 *
 * Arguments (in the order used by the callers in this file):
 * destp		Address of the first byte of the first line.
 * stippleleftmask	Byte written at the start of the left part.
 * leftbcount		Number of bytes in the left part (0..4).
 * midlcount		Number of full 32-bit words in the middle part.
 * rightbcount		Number of bytes in the right part (0..4).
 * stipplerightmask	Byte written at the end of the right part.
 * h			Number of lines to write.
 * destpitch		Byte distance between successive lines written.
 * stippleword		32-bit stipple word for these lines.
 *
 * The bytes following the left mask are taken from the top of stippleword
 * so that the left part ends with bits 24..31; the right part starts with
 * bits 0..7. This assumes a little-endian byte order and a left part that
 * ends on a 32-bit boundary.
 * NOTE(review): the callers can shorten leftbcount for areas that end
 * inside the first 32-bit word; the byte selection here does not account
 * for that -- confirm such areas are never routed to this function.
 *
 * The middle words are stored through an unsigned int pointer so that
 * exactly four bytes are written per word, also where unsigned long is
 * wider than 32 bits.
 */
static void CirrusColorExpandWriteStippleSpans(destp, stippleleftmask,
leftbcount, midlcount, rightbcount, stipplerightmask, h, destpitch,
stippleword)
	unsigned char *destp;
	int stippleleftmask, leftbcount, midlcount, rightbcount;
	int stipplerightmask;
	int h;
	int destpitch;
	unsigned long stippleword;
{
	while (h > 0) {
		unsigned char *destpsave;
		int i;
		destpsave = destp;
		/* Left part: mask byte, then the bytes up to the boundary. */
		switch (leftbcount) {
		case 1 :
			*destp = stippleleftmask;
			destp++;
			break;
		case 2 :
			*destp = stippleleftmask;
			*(destp + 1) = stippleword >> 24;
			destp += 2;
			break;
		case 3 :
			*destp = stippleleftmask;
			*(destp + 1) = stippleword >> 16;
			*(destp + 2) = stippleword >> 24;
			destp += 3;
			break;
		case 4 :
			*destp = stippleleftmask;
			*(destp + 1) = stippleword >> 8;
			*(unsigned short *)(destp + 2) = stippleword >> 16;
			destp += 4;
			break;
		}
		/* Middle part: whole stipple words. */
		for (i = 0; i < midlcount; i++) {
			*(unsigned int *)destp = (unsigned int)stippleword;
			destp += 4;
		}
		/* Right part: leading bytes of the word, then mask byte. */
		switch (rightbcount) {
		case 1 :
			*destp = stipplerightmask;
			break;
		case 2 :
			*destp = stippleword;
			*(destp + 1) = stipplerightmask;
			break;
		case 3 :
			*(unsigned short *)destp = stippleword;
			*(destp + 2) = stipplerightmask;
			break;
		case 4 :
			*(unsigned short *)destp = stippleword;
			*(destp + 2) = stippleword >> 16;
			*(destp + 3) = stipplerightmask;
			break;
		}
		destp = destpsave + destpitch;
		h--;
	}
}

#endif

/*
 * Transparent stipple fill of the w x h area at (x, y) with pixel value
 * fg, using extended write mode 4 in BY8 addressing mode.
 *
 * Arguments:
 * x, y		Coordinates of the destination area.
 * w, h		Size of the area.
 * bits_in	Stipple, one 32-bit word per pattern line.
 * sh		Height of the stipple in lines.
 * sox, soy	Stipple origin.
 * fg		Foreground pixel value.
 * destpitch	Scanline width of screen in bytes.
 *
 * A rotated and per-byte bit-reversed copy of the stipple is built first.
 * Within each banking region the scanlines are written interleaved: for
 * every stipple line, all scanlines of the region that use that line are
 * written in one call of CirrusColorExpandWriteStippleSpans.
 */
void CirrusColorExpandStippleFill(x, y, w, h, bits_in, sh, sox, soy, fg,
destpitch)
	int x, y, w, h;
	unsigned long *bits_in;
	int sh, sox, soy, fg, destpitch;
{
	int destaddr;
	int bank;
	unsigned char *base;	/* Video window base address. */
	int nspans, bitoffset;
	int leftmask, leftbcount, midlcount, rightbcount, rightmask;
	int bytealignment;

	unsigned long *new_bits;
	int i, syindex;

	/* Reverse per-byte bit order of the stipple. */
	new_bits = (unsigned long *)ALLOCATE_LOCAL(sh * 4);
	for (i = 0; i < sh; i++) {
		if (bits_in[i] == 0xffffffff)
			/* Rotation and reversal leave all-ones unchanged. */
			new_bits[i] = 0xffffffff;
		else {
			unsigned long bits;
			/* Rotate so that data is correctly aligned to */
			/* origin for writing dwords to framebuffer. */
			bits = rotateleft(32 - (sox & 31), bits_in[i]);
			/* Reverse each of the four bytes. */
			((unsigned char *)new_bits)[i * 4] =
				byte_reversed[(unsigned char)bits];
			((unsigned char *)new_bits)[i * 4 + 1] =
				byte_reversed[(unsigned char)(bits >> 8)];
			((unsigned char *)new_bits)[i * 4 + 2] =
				byte_reversed[(unsigned char)(bits >> 16)];
			((unsigned char *)new_bits)[i * 4 + 3] =
				byte_reversed[(unsigned char)(bits >> 24)];
		}
	}

	base = CIRRUSSINGLEBASE();	/* Read/write window. */
	destaddr = y * destpitch + x;

	/* Enable extended write modes and BY8 addressing. */
	/* Every addressing byte corresponds to 8 pixels. */
	SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING | SINGLEBANKED);

	SETWRITEMODE(4);

	SETPIXELMASK(0xff);
	SETFOREGROUNDCOLOR(fg);

	/* rightmask is only computed when there is a right part. */
	rightmask = 0;

	/* Calculate masks and counts. */
	/* Bit offset of leftmost pixel of area to be filled. */
	bitoffset = destaddr & 7;
	destaddr >>= 3;			/* Divide address by 8. */
	leftmask = leftbitmask[bitoffset];
	/* w now counts the pixels to the right of the first byte. */
	w -= 8 - bitoffset;
	/* Index (0..3) of the first byte within its 32-bit word. */
	bytealignment = (x >> 3) & 3;
	if (w < 0) {
		/* Just one byte. */
		leftmask &= rightbitmask[w + 8];
		leftbcount = 1;
		midlcount = 0;
		rightbcount = 0;
		goto masksdone;
	}
	leftbcount = 4 - bytealignment;
	if ((leftbcount - 1) * 8 > w)
		/* If the area falls within a 32-bit word, leftbcount does */
		/* not indicate the byte alignment. */
		leftbcount = w / 8 + 1;
	w -= (leftbcount - 1) * 8;
	midlcount = w / 32;
	w &= 31;
	rightbcount = (w + 7) / 8;
	if (rightbcount > 0)
		rightmask = rightbitmask[w - (rightbcount - 1) * 8];
masksdone:

	CIRRUSSETSINGLEB(destaddr, bank);

	syindex = (y - soy) % sh;	/* y index into source bitmap. */
	/* The remainder is negative when y lies above the stipple origin; */
	/* bring it back into 0..sh-1 so new_bits is never indexed below 0. */
	if (syindex < 0)
		syindex += sh;

	nspans = h;	/* Number of spans to go. */

	for (;;) {
		int n, minlines, oneextralimit, startsyindex;
		unsigned char *destp;

		/* Calculate how many scanlines fit in this banking region. */
		n = CIRRUSSINGLEREGIONLINES(destaddr, destpitch >> 3);
		if (n > nspans)
			n = nspans;
		nspans -= n;

		destp = base + destaddr;
		startsyindex = syindex;
		/* Every stipple line is used for minlines scanlines of the */
		/* region; the first oneextralimit lines for one more. */
		minlines = n / sh;
		oneextralimit = n % sh;
		for (i = 0; i < sh && i < n; i++) {
			int linecount;
			/* Defined even when the corresponding part is */
			/* empty, since both are always passed on. */
			int stippleleftmask = 0, stipplerightmask = 0;
			union {
				unsigned long dword;
				unsigned char byte[4];
			} stipplebits;
			stipplebits.dword = new_bits[syindex];

			/* Combine the edge masks with the stipple bytes */
			/* that fall on the first and the last byte. */
			if (leftbcount > 0)
				stippleleftmask = leftmask &
					stipplebits.byte[bytealignment];
			if (rightbcount > 0)
				stipplerightmask = rightmask &
					stipplebits.byte[
						(bytealignment + leftbcount +
						rightbcount - 1) & 3];
			linecount = minlines;
			if (i < oneextralimit)
				linecount++;

			/* Write every sh-th scanline, starting at line i. */
			CirrusColorExpandWriteStippleSpans(destp,
				stippleleftmask, leftbcount, midlcount,
				rightbcount, stipplerightmask, linecount,
				(destpitch >> 3) * sh, stipplebits.dword);

			destp += (destpitch >> 3);
			syindex++;
			if (syindex >= sh)
				syindex = 0;
		}
		/* Stipple line for the first scanline of the next region. */
		syindex = startsyindex + oneextralimit;
		if (syindex >= sh)
			syindex -= sh;

		if (nspans == 0)
			break;
		destaddr += n * (destpitch >> 3);
		CIRRUSCHECKSINGLEB(destaddr, bank);
	}

	/* Disable extended write modes and BY8 addressing. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(SINGLEBANKED);
	}
	else {
		SETMODEEXTENSIONS(DOUBLEBANKED);
	}
	SETWRITEMODE(0);
	SETFOREGROUNDCOLOR(0x00);	/* Disable set/reset. */

	DEALLOCATE_LOCAL(new_bits);
}


/*
 * Optimized opaque stipple.
 * Within unbanked regions, first draws left edge, then middle, then right
 * edge.
 *
 * Arguments:
 * x, y		Coordinates of the destination area.
 * w, h		Size of the area.
 * bits_in	Stipple, one 32-bit word per pattern line.
 * sh		Height of the stipple in lines.
 * sox, soy	Stipple origin.
 * bg, fg	Background and foreground pixel values.
 * destpitch	Scanline width of screen in bytes.
 *
 * Uses extended write mode 5 in BY8 addressing mode. The edges are
 * clipped with the pixel mask instead of by masking the stipple data;
 * the first and the last byte of each scanline are therefore written
 * separately, with the pixel mask set to the edge mask.
 */

void CirrusColorExpandOpaqueStippleFill(x, y, w, h, bits_in, sh, sox, soy,
bg, fg, destpitch)
	int x, y, w, h;
	unsigned long *bits_in;
	int sh, sox, soy, bg, fg, destpitch;
{
	int destaddr;
	int bank;
	unsigned char *base;	/* Video window base address. */
	int nspans, bitoffset;
	int leftmask, leftbcount, midlcount, rightbcount, rightmask;
	int bytealignment;

	unsigned long *new_bits;
	int i, syindex;
	int startsyindex;

	/* Reverse per-byte bit order of the stipple. */
	new_bits = (unsigned long *)ALLOCATE_LOCAL(sh * 4);
	for (i = 0; i < sh; i++) {
		if (bits_in[i] == 0xffffffff)
			/* Rotation and reversal leave all-ones unchanged. */
			new_bits[i] = 0xffffffff;
		else {
			unsigned long bits;
			/* Rotate so that data is correctly aligned to */
			/* origin for writing dwords to framebuffer. */
			bits = rotateleft(32 - (sox & 31), bits_in[i]);
			/* Reverse each of the four bytes. */
			((unsigned char *)new_bits)[i * 4] =
				byte_reversed[(unsigned char)bits];
			((unsigned char *)new_bits)[i * 4 + 1] =
				byte_reversed[(unsigned char)(bits >> 8)];
			((unsigned char *)new_bits)[i * 4 + 2] =
				byte_reversed[(unsigned char)(bits >> 16)];
			((unsigned char *)new_bits)[i * 4 + 3] =
				byte_reversed[(unsigned char)(bits >> 24)];
		}
	}

	base = CIRRUSSINGLEBASE();		/* Read/write window. */
	destaddr = y * destpitch + x;

	/* Enable extended write modes and BY8 addressing. */
	/* Every addressing byte corresponds to 8 pixels. */
	SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING | SINGLEBANKED);

	SETWRITEMODE(5);	/* Opaque. */

	SETPIXELMASK(0xff);
	SETFOREGROUNDCOLOR(fg);
	SETBACKGROUNDCOLOR(bg);

	/* rightmask is only computed when there is a right part. */
	rightmask = 0;

	/* Calculate masks and counts. */
	/* Bit offset of leftmost pixel of area to be filled. */
	bitoffset = destaddr & 7;
	destaddr >>= 3;			/* Divide address by 8. */
	leftmask = leftbitmask[bitoffset];
	/* w now counts the pixels to the right of the first byte. */
	w -= 8 - bitoffset;
	/* Index (0..3) of the first byte within its 32-bit word. */
	bytealignment = (x >> 3) & 3;
	if (w < 0) {
		/* Just one byte. */
		leftmask &= rightbitmask[w + 8];
		leftbcount = 1;
		midlcount = 0;
		rightbcount = 0;
		goto masksdone;
	}
	leftbcount = 4 - bytealignment;
	if ((leftbcount - 1) * 8 > w)
		/* If the area falls within a 32-bit word, leftbcount does */
		/* not indicate the byte alignment. */
		leftbcount = w / 8 + 1;
	w -= (leftbcount - 1) * 8;
	midlcount = w / 32;
	w &= 31;
	rightbcount = (w + 7) / 8;
	if (rightbcount > 0)
		rightmask = rightbitmask[w - (rightbcount - 1) * 8];
masksdone:

	CIRRUSSETSINGLEB(destaddr, bank);

	syindex = (y - soy) % sh;	/* y index into source bitmap. */
	/* The remainder is negative when y lies above the stipple origin; */
	/* bring it back into 0..sh-1 so new_bits is never indexed below 0. */
	if (syindex < 0)
		syindex += sh;

	nspans = h;	/* Number of spans to go. */

	for (;;) {
		int n, minlines, oneextralimit;
		unsigned char *destp;

		/* Calculate how many scanlines fit in this banking region. */
		n = CIRRUSSINGLEREGIONLINES(destaddr, destpitch >> 3);
		if (n > nspans)
			n = nspans;
		nspans -= n;

		/* Every stipple line is used for minlines scanlines of the */
		/* region; the first oneextralimit lines for one more. */
		minlines = n / sh;
		oneextralimit = n % sh;

/* Left edge. */ 
		SETPIXELMASK(leftmask);
		destp = base + destaddr;
		startsyindex = syindex;

		/* First byte of every scanline, in scanline order. */
		for (i = 0; i < n; i++) {
			union {
				unsigned long dword;
				unsigned char byte[4];
			} stipplebits;
			stipplebits.dword = new_bits[syindex];

			*destp = stipplebits.byte[bytealignment];

			destp += (destpitch >> 3);
			syindex++;
			if (syindex >= sh)
				syindex = 0;
		}

/* Middle part */
		/* Nothing lies between the first and the last byte. */
		if (midlcount == 0 && leftbcount <= 1 && rightbcount <= 1)
			goto skipmiddlepart;
		SETPIXELMASK(0xff);
		destp = base + destaddr + 1;
		syindex = startsyindex;

		/* Interleaved: one call per stipple line. */
		for (i = 0; i < sh && i < n; i++) {
			int linecount;
			union {
				unsigned long dword;
				unsigned char byte[4];
			} stipplebits;
			stipplebits.dword = new_bits[syindex];

			linecount = minlines;
			if (i < oneextralimit)
				linecount++;

			/* The edge bytes are excluded here, so both byte */
			/* counts are one less and the "mask" arguments */
			/* are plain stipple bytes. */
			CirrusColorExpandWriteStippleSpans(destp,
				stipplebits.byte[(bytealignment + 1) & 3],
				leftbcount - 1, midlcount,
				(rightbcount == 0 ? 0 : rightbcount - 1),
				stipplebits.byte[(bytealignment + leftbcount
					+ rightbcount - 2) & 3],
				linecount, (destpitch >> 3) * sh,
				stipplebits.dword);

			destp += (destpitch >> 3);
			syindex++;
			if (syindex >= sh)
				syindex = 0;
		}
skipmiddlepart:

/* Right edge */
		if (rightbcount == 0)
			goto skiprightpart;
		SETPIXELMASK(rightmask);
		destp = base + destaddr + leftbcount + midlcount * 4 +
			rightbcount - 1;
		syindex = startsyindex;
		/* Last byte of every scanline, in scanline order. */
		for (i = 0; i < n; i++) {
			union {
				unsigned long dword;
				unsigned char byte[4];
			} stipplebits;
			stipplebits.dword = new_bits[syindex];

			*destp = stipplebits.byte[(bytealignment + leftbcount
				+ rightbcount - 1) & 3];
			destp += (destpitch >> 3);
			syindex++;
			if (syindex >= sh)
				syindex = 0;
		}
skiprightpart:

		/* Stipple line for the first scanline of the next region. */
		syindex = startsyindex + oneextralimit;
		if (syindex >= sh)
			syindex -= sh;

		if (nspans == 0)
			break;
		destaddr += n * (destpitch >> 3);
		CIRRUSCHECKSINGLEB(destaddr, bank);
	}

	/* Disable extended write modes and BY8 addressing. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(SINGLEBANKED);
	}
	else {
		SETMODEEXTENSIONS(DOUBLEBANKED);
	}
	SETWRITEMODE(0);
	SETFOREGROUNDCOLOR(0x00);	/* Disable set/reset. */
	SETPIXELMASK(0xff);

	DEALLOCATE_LOCAL(new_bits);
}


/*
 * This function uses the 8 data latches for fast multiple-of-8 pixel wide
 * tile fill. Divides into banking regions.
 * Supports tilewidths of 8, 16, 24, 32, 40, 48, 56 and 64.
 *
 * Arguments:
 * x, y		Coordinates of the destination area.
 * w, h		Size of the area.
 * vtileaddr	Offset in video memory of tile.
 * tpitch	Width of a tile line in bytes.
 * tbytes	Width of tile / 8.
 * theight	Height of tile.
 * toy		Tile y-origin.
 *
 * w must be >= 32.
 * Tile must be aligned to start of scanline.
 */

#ifdef AVOID_ASM_ROUTINES

/*
 * Portable C version of the tile span writer.
 * For each of linecount lines, stores a zero byte at destp,
 * destp + tbytes, ... (count stores in all); destp then advances by
 * vpitch bytes to the next line.
 */
static void CirrusLatchWriteTileSpans(destp, count, tbytes, linecount, vpitch)
	unsigned char *destp;
	int tbytes, count, linecount, vpitch;
{
	int remaining = linecount;

	while (remaining-- > 0) {
		unsigned char *p = destp;
		int k;

		for (k = count; k > 0; k--) {
			*p = 0;
			p += tbytes;
		}
		destp += vpitch;
	}
}

#endif

/*
 * Latch-based tile fill of the w x h area at (x, y); see the argument
 * description above. Runs in write mode 1 with BY8 addressing and the
 * 8 byte data latches enabled: a read of a tile byte loads 8 tile pixels
 * into the latches and each following write stores them.
 *
 * Within each banking region the left partial byte, the full bytes and
 * the right partial byte are handled in turn. The full bytes are written
 * interleaved: each 8-pixel chunk of a tile line is read once and then
 * written to every position and scanline of the region that needs it.
 *
 * NOTE(review): the tile is addressed with a line stride of tbytes in
 * BY8 mode; tpitch is not used here -- confirm this matches how the
 * caller stores the tile.
 */
void CirrusColorExpandFillTile8(x, y, w, h, vtileaddr, tpitch, tbytes, theight,
toy, destpitch)
	int x, y, w, h;
	int vtileaddr;
	int tpitch;
	int tbytes;	/* Width of the tile in units of 8 pixels. */
	int theight, toy, destpitch;
{
	int i;
	int destlineaddr;
	int writebank;
	int tyindex;
	unsigned char *vtilep;
	int tcount;
	int *chunk8_tcount;
	int nspans;
	int bitoffset;
	int wbytes;
	int oneextracount;
	int tbyteindexleft, tbyteindexright, tbyteindexmiddle;
	unsigned char *readbase;	/* Video read window base address. */
	unsigned char *writebase;	/* Video write window */

	readbase = CIRRUSREADBASE();
	writebase = CIRRUSWRITEBASE();

	chunk8_tcount = (int *)ALLOCATE_LOCAL(tbytes * sizeof(int));

	/* Pointer to tile in read window in BY8 addressing mode. */
	vtileaddr >>= 3;
	CIRRUSSETREAD(vtileaddr);
	vtilep = CIRRUSREADBASE() + vtileaddr;

	destlineaddr = y * destpitch;	/* x added later. */

	/* Enable extended write modes, BY8 addressing, and 8 byte data */
	/* latches. Every addressing byte corresponds to 8 pixels. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING |
			EIGHTDATALATCHES);
	}
	else {
		SETMODEEXTENSIONS(EXTENDEDWRITEMODES | BY8ADDRESSING |
			EIGHTDATALATCHES | DOUBLEBANKED);
	}

	SETWRITEMODE(1);

	SETPIXELMASK(0xff);
	SETFOREGROUNDCOLOR(0);		/* Disable set/reset. */

	destlineaddr >>= 3;			/* Divide address by 8. */

	CIRRUSSETWRITEB(destlineaddr, writebank);

	/* Current tile line index. */
	tyindex = (y - toy) % theight;
	/* The remainder is negative when y lies above the tile origin; */
	/* bring it back into 0..theight-1 so the tile is never addressed */
	/* before its first line. */
	if (tyindex < 0)
		tyindex += theight;

	bitoffset = x & 7;
	/* Calculate the number of full 8-pixel bytes in each scanline. */
	wbytes = (w - (bitoffset == 0 ? 0 : 8 - bitoffset)) / 8;
	/* Number of full tile 'widths'. */
	tcount = wbytes / tbytes;
	oneextracount = wbytes % tbytes;
	/* Calculate how many full 8 pixels chunks we can write per */
	/* scanline for each of the tbytes chunks in the tile line. */
	for (i = 0; i < oneextracount; i++)
		chunk8_tcount[i] = tcount + 1;
	while (i < tbytes) {
		chunk8_tcount[i] = tcount;
		i++;
	}

	/* Precompute value used in left edge loop. */
	/* The edge indices are only used when the edge exists; give */
	/* them a defined value in any case. */
	tbyteindexleft = 0;
	tbyteindexright = 0;
	if (bitoffset > 0) {
		tbyteindexleft = (x >> 3) % tbytes;
		tbyteindexmiddle = ((x + 7) >> 3) % tbytes;
	}
	else
		tbyteindexmiddle = (x >> 3) % tbytes;
	if (((x + w) & 7) > 0)
		tbyteindexright = ((x + w) >> 3) % tbytes;

	nspans = h;	/* Number of spans to go. */

	for (;;) {
		int n, minlines, oneextralimit, starttyindex;
		unsigned char *destp, *vtilelinep, *startvtilelinep;

		/* Calculate how many scanlines fit in this banking region. */
		n = CIRRUSWRITEREGIONLINES(destlineaddr, destpitch >> 3);
		if (n > nspans)
			n = nspans;
		nspans -= n;

		/* Every tile line is used for minlines scanlines of the */
		/* region; the first oneextralimit lines for one more. */
		minlines = n / theight;
		oneextralimit = n % theight;

		starttyindex = tyindex;
		startvtilelinep = vtilep + tyindex * tbytes;

/* Left edge. */
		if (bitoffset == 0)
			goto skipleftedge;
		SETPIXELMASK(leftbitmask[bitoffset]);
		/* Address in write window. */
		destp = writebase + destlineaddr + (x >> 3);
		vtilelinep = startvtilelinep + tbyteindexleft;
		for (i = 0; i < n; i++) {
			__volatile__ unsigned char tmp;
			/* Load the latches from the tile, then write them. */
			tmp = *vtilelinep;
			*destp = 0;
			destp += destpitch >> 3;
			tyindex++;
			vtilelinep += tbytes;
			if (tyindex == theight) {
				/* Wrap to the first tile line. */
				tyindex = 0;
				vtilelinep = vtilep + tbyteindexleft;
			}
		}
		SETPIXELMASK(0xff);
skipleftedge:

/* Middle part. */
		destp = writebase + destlineaddr + (x >> 3);
		if (bitoffset > 0)
			destp++;
		vtilelinep = startvtilelinep;
		tyindex = starttyindex;
		for (i = 0; i < theight && i < n; i++) {
			int linecount, k, tbyteindex;

			linecount = minlines;
			if (i < oneextralimit)
				linecount++;

			tbyteindex = tbyteindexmiddle;

			if (linecount > 0)
			for (k = 0; k < tbytes; k++) {
				/* Handle tile line pixels (k * 8) to */
				/* ((k + 1) * 8 - 1). */
				__volatile__ unsigned char tmp;
				/* Read 8 tile pixels into latches. */
				tmp = vtilelinep[tbyteindex];

				/* Now write them at every tbytes-th pixel */
				/* offset in every theight-th scanline in */
				/* the banking region. */

				if (chunk8_tcount[k] > 0)
					CirrusLatchWriteTileSpans(
						destp + k,
						chunk8_tcount[k],
						tbytes,
						linecount,
						(destpitch >> 3) * theight
						);
				tbyteindex++;
				if (tbyteindex >= tbytes)
					tbyteindex = 0;
			}

			destp += (destpitch >> 3);
			tyindex++;
			vtilelinep += tbytes;
			if (tyindex == theight) {
				/* Wrap to the first tile line. */
				tyindex = 0;
				vtilelinep = vtilep;
			}
		}

/* Right part. */
		if (((x + w) & 7) == 0)
			goto skiprightpart;
		SETPIXELMASK(rightbitmask[(x + w) & 7]);
		destp = writebase + destlineaddr + ((x + w) >> 3);
		vtilelinep = startvtilelinep + tbyteindexright;
		tyindex = starttyindex;
		for (i = 0; i < n; i++) {
			__volatile__ unsigned char tmp;
			/* Load the latches from the tile, then write them. */
			tmp = *vtilelinep;
			*destp = 0;
			destp += destpitch >> 3;
			tyindex++;
			vtilelinep += tbytes;
			if (tyindex == theight) {
				/* Wrap to the first tile line. */
				tyindex = 0;
				vtilelinep = vtilep + tbyteindexright;
			}
		}
		SETPIXELMASK(0xff);
skiprightpart:

		if (nspans == 0)
			break;

		/* Calculate tile line index for next region. */
		tyindex = starttyindex + oneextralimit;
		if (tyindex >= theight)
			tyindex -= theight;

		destlineaddr += n * (destpitch >> 3);
		CIRRUSCHECKWRITEB(destlineaddr, writebank);
	}

	/* Disable extended write modes and BY8 addressing. */
	if (cirrusUseLinear) {
		SETMODEEXTENSIONS(SINGLEBANKED);
	}
	else {
		SETMODEEXTENSIONS(DOUBLEBANKED);
	}
	SETWRITEMODE(0);
	SETFOREGROUNDCOLOR(0x00);	/* Disable set/reset. */

	DEALLOCATE_LOCAL(chunk8_tcount);
}