/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Ralph Campbell.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <mips/asm.h>
#include <mips/regdef.h>
#include <mips/m32c0.h>

	.set	nomips16

/*
 * Partial-register ("left"/"right") load and store mnemonics used for
 * unaligned accesses.  The doubleword forms are selected when __mips64
 * is set, the word forms otherwise.
 */
#if __mips64
#define LL	ldl
#define LR	ldr
#define SL	sdl
#define SR	sdr
#else
#define LL	lwl
#define LR	lwr
#define SL	swl
#define SR	swr
#endif

/*
 * Endian-neutral names for the left/right instructions above.
 * In memcpy below, LHI/SHI are issued at the lowest address of an
 * unaligned register-sized access (offset 0) and LLO at the address of
 * its last byte (offset SZREG-1).  Little-endian builds (__MIPSEL__) map
 * the HI names to the "right" forms and the LO names to the "left"
 * forms; all other builds use the opposite mapping.
 * SLO is not used anywhere in the code visible in this file.
 */
#ifdef __MIPSEL__
#define	LHI	LR
#define	LLO	LL
#define	SHI	SR
#define	SLO	SL
#else
#define	LHI	LL
#define	LLO	LR
#define	SHI	SL
#define	SLO	SR
#endif

/*
 * Hint operands for the pref instructions in memcpy.  They are only
 * defined here if nothing earlier (headers or command line) has already
 * defined them.  In the MIPS32 PREF hint encoding, 0 is "load" and 30 is
 * "PrepareForStore".
 */
#ifndef PREF_LOAD
#define PREF_LOAD		0
#endif
#ifndef PREF_PREPAREFORSTORE
#define PREF_PREPAREFORSTORE	30
#endif

/*
 * Prefetch distance tuning.
 *
 * PREF_LOAD_BIAS_* and PREF_STORE_BIAS_* are byte offsets added to the
 * current source (a1) and destination (a0) pointers by the pref
 * instructions in memcpy; the _UNALIGNED and _ALIGNED variants belong to
 * the unaligned and aligned block-copy loops respectively.
 *
 * Each "#if" group below is one complete set of the four values.  Only
 * the group guarded by "#if 1" is active; the groups under "#if 0" are
 * kept as per-core settings and are not compiled.
 */
#if 0
/* 24Kc FPGA, 24 MHz, CPU:BUS:RAM speed 8:2:1, 16k D$,
   CoreFPGA2-24Kc_8.269-sandesh-XC2V8000.fl */
#define PREF_LOAD_BIAS_UNALIGNED 0x40
#define PREF_STORE_BIAS_UNALIGNED 0x1780
#define PREF_LOAD_BIAS_ALIGNED 0x60
#define PREF_STORE_BIAS_ALIGNED 0x1780
#endif
#if 0
/* 24Kc LV, 266 MHz, CPU:BUS:RAM speed 4:2:1, 16k D$ */
#define PREF_LOAD_BIAS_UNALIGNED 0x40
#define PREF_STORE_BIAS_UNALIGNED 0x720
#define PREF_LOAD_BIAS_ALIGNED 0x40
#define PREF_STORE_BIAS_ALIGNED 0x7a0
#endif
#if 0
/* 20Kc LV, 500 MHz, CPU:RAM speed 5:1,
   NOTE: Only ~20% improvement with prefetching */
#define PREF_LOAD_BIAS_UNALIGNED 0x40
#define PREF_STORE_BIAS_UNALIGNED 0x20
#define PREF_LOAD_BIAS_ALIGNED 0x40
#define PREF_STORE_BIAS_ALIGNED 0x20
#endif
#if 0
/* 34Kc FPGA, 24 MHz, CPU:BUS:RAM speed 8:2:1, 16k D$,
   CoreFPGA2-34Kc_6.125_3TC-sandesh-XC2V8000.fl */
#define PREF_LOAD_BIAS_UNALIGNED 0x1020
#define PREF_STORE_BIAS_UNALIGNED 0x20
#define PREF_LOAD_BIAS_ALIGNED 0x980
#define PREF_STORE_BIAS_ALIGNED 0x20
#endif

#if 1
/* preliminary "generic" version */
#define PREF_LOAD_BIAS_UNALIGNED 0x60
#define PREF_STORE_BIAS_UNALIGNED 0x60
#define PREF_LOAD_BIAS_ALIGNED 0x60
#define PREF_STORE_BIAS_ALIGNED 0x60
#endif


/* The block-copy loops assume 32-byte D-cache lines: CACHELINE_SIZE is
   both the prefetch stride and the number of bytes moved per unrolled
   loop iteration.  The code must nevertheless stay correct when the real
   line size is larger; in particular it must never prefetch beyond the
   end of the copy area.  MAX_CACHELINE_SIZE is the line size used when
   computing that safety margin (NOFETCH_SIZE_*). */
#define CACHELINE_SIZE		32
#define MAX_CACHELINE_SIZE	128

/*
 * NOFETCH_SIZE_* is the number of bytes before the end of the block-copy
 * region in which memcpy stops issuing prefetches: memcpy subtracts it
 * from the block end address and only prefetches while the source
 * pointer is below the result.
 *
 * For each loop flavour it is the larger of the load and store bias
 * (.ifgt assembles its branch when the expression is greater than zero,
 * i.e. when the load bias is the larger one) plus a margin of
 * 2 * MAX_CACHELINE_SIZE - SZREG bytes.
 */
.ifgt PREF_LOAD_BIAS_UNALIGNED-PREF_STORE_BIAS_UNALIGNED
NOFETCH_SIZE_UNALIGNED=	(2 * MAX_CACHELINE_SIZE - SZREG) + PREF_LOAD_BIAS_UNALIGNED
.else
NOFETCH_SIZE_UNALIGNED=	(2 * MAX_CACHELINE_SIZE - SZREG) + PREF_STORE_BIAS_UNALIGNED
.endif
.ifgt PREF_LOAD_BIAS_ALIGNED-PREF_STORE_BIAS_ALIGNED
NOFETCH_SIZE_ALIGNED=	(2 * MAX_CACHELINE_SIZE - SZREG) + PREF_LOAD_BIAS_ALIGNED
.else
NOFETCH_SIZE_ALIGNED=	(2 * MAX_CACHELINE_SIZE - SZREG) + PREF_STORE_BIAS_ALIGNED
.endif

/*
 * void *memcpy(void *to, const void *from, size_t n)
 *
 * In:      a0 = to, a1 = from, a2 = n
 * Out:     v0 = to (the original destination pointer)
 * Scratch: v1, a3, t0-t3, ta0-ta3; bgeu/bltu are assembler macros and
 *          may additionally use AT.
 *
 * Strategy:
 *   - n == 0: return at once; n < 12: plain byte loop.
 *   - Otherwise the destination is advanced to a SZREG boundary.  If
 *     source and destination share the same offset within a register
 *     word, the "aligned" path uses REG_L/REG_S; if not, the source is
 *     read with LHI/LLO pairs while the destination is still written
 *     with REG_S.
 *   - Each path copies CACHELINE_SIZE-byte blocks first (prefetching only
 *     while a1 is below t3 = block end - NOFETCH_SIZE_*), then single
 *     registers, and finally falls into the byte loop for the tail.
 *
 * The whole body is assembled under .set noreorder: the instruction
 * after each branch (indented by one extra space) sits in its delay slot
 * and executes whether or not the branch is taken.
 */
LEAF(memcpy)
	.set	noreorder
	move	v0, a0			# return value: original destination
	beqz	a2, leave		# n == 0: nothing to copy
	 sltu	t2, a2, 12		# check for small copy

	bnez	t2, smallcpy		# short copies go byte by byte
	 xor	v1, a1, a0		# compare low bits of addresses
	and	v1, SZREG-1		# v1 != 0: relative misalignment
	subu	a3, zero, a0		# compute # bytes to word align address
	beqz	v1, aligned		# addresses can both be word aligned
	 and	a3, SZREG-1		# a3 = bytes until a0 is aligned

	/* Unaligned path: align the destination, source stays unaligned */
	beqz	a3, 1f			# destination already aligned
	 subu	a2, a3			# subtract from remaining count
	LHI	v1, 0(a1)		# get next SZREG bytes (unaligned)
	LLO	v1, SZREG-1(a1)
	addu	a1, a3
	SHI	v1, 0(a0)		# store 0..SZREG-1 bytes to align a0
	addu	a0, a3

	/* Try a CACHELINE_SIZE unrolled unaligned block copy with prefetch */
1:	and	v1, a2, CACHELINE_SIZE - 1	# remaining size % blocksize
	subu	a3, a2, v1			# size of remaining blocks
	beqz	a3, unaligned_wordcpy		# none?
	 move	a2, v1				# bytes remaining after block copy
	addu	a3, a1				# compute ending address
	subu	t3, a3, NOFETCH_SIZE_UNALIGNED	# prefetch only below t3

	bgeu	a1, t3, 2f			# too close to the end: no prefetch
	 nop

	/*
	 * Prime the two cache lines preceding the steady-state prefetch
	 * distance.  The store prefetches must use the UNALIGNED bias here:
	 * t3 was derived from NOFETCH_SIZE_UNALIGNED, which only accounts
	 * for the *_UNALIGNED biases.
	 */
	pref	PREF_LOAD, PREF_LOAD_BIAS_UNALIGNED-2*CACHELINE_SIZE(a1)
	pref	PREF_LOAD, PREF_LOAD_BIAS_UNALIGNED-1*CACHELINE_SIZE(a1)
	pref	PREF_PREPAREFORSTORE, PREF_STORE_BIAS_UNALIGNED-2*CACHELINE_SIZE(a0)
	pref	PREF_PREPAREFORSTORE, PREF_STORE_BIAS_UNALIGNED-1*CACHELINE_SIZE(a0)

	/* 1: block copy with prefetch, 2: block copy without prefetch */
1:	pref	PREF_LOAD, PREF_LOAD_BIAS_UNALIGNED(a1)
	pref	PREF_PREPAREFORSTORE, PREF_STORE_BIAS_UNALIGNED(a0)
2:	LHI	v1, SZREG*0(a1)			# copy block a1 unaligned, a0 aligned
	LLO	v1, SZREG*0+SZREG-1(a1)
	LHI	t0, SZREG*1(a1)
	LLO	t0, SZREG*1+SZREG-1(a1)
	LHI	t1, SZREG*2(a1)
	LLO	t1, SZREG*2+SZREG-1(a1)
#ifdef __mips64
	LHI	ta3, SZREG*3(a1)
	LLO	ta3, SZREG*3+SZREG-1(a1)
#else
	LHI	t2, SZREG*3(a1)
	LLO	t2, SZREG*3+SZREG-1(a1)
	LHI	ta0, SZREG*4(a1)
	LLO	ta0, SZREG*4+SZREG-1(a1)
	LHI	ta1, SZREG*5(a1)
	LLO	ta1, SZREG*5+SZREG-1(a1)
	LHI	ta2, SZREG*6(a1)
	LLO	ta2, SZREG*6+SZREG-1(a1)
	LHI	ta3, SZREG*7(a1)
	LLO	ta3, SZREG*7+SZREG-1(a1)
#endif
	REG_S	v1, SZREG*0(a0)
	REG_S	t0, SZREG*1(a0)
	REG_S	t1, SZREG*2(a0)
#ifndef __mips64
	REG_S	t2, SZREG*3(a0)
	REG_S	ta0, SZREG*4(a0)
	REG_S	ta1, SZREG*5(a0)
	REG_S	ta2, SZREG*6(a0)
#endif
	addu	a1, CACHELINE_SIZE
	addu	a0, CACHELINE_SIZE
	bltu	a1, t3, 1b			# still far from the end: prefetch
	 REG_S	ta3, -SZREG(a0)			# last register of the block
	bne	a1, a3, 2b			# finish blocks without prefetch
	 nop

unaligned_wordcpy:
	and	v1, a2, SZREG-1		# remaining size % word size
	subu	a3, a2, v1		# size of remaining words
	beqz	a3, smallcpy		# none?
	 move	a2, v1			# bytes remaining after word copy
	addu	a3, a1			# compute ending address

1:	LHI	v1, 0(a1)		# copy words a1 unaligned, a0 aligned
	LLO	v1, SZREG-1(a1)
	addu	a1, SZREG
	addu	a0, SZREG
	bne	a1, a3, 1b
	 REG_S	v1, -SZREG(a0)

	b	smallcpy		# byte loop handles the tail
	 nop

aligned:
	/* Both addresses have the same alignment: do initial bytes to align */
	beqz	a3, 1f			# already aligned
	 subu	a2, a3			# subtract from remaining count
	LHI	v1, 0(a1)		# copy 1..SZREG-1 bytes to align
	addu	a1, a3
	SHI	v1, 0(a0)
	addu	a0, a3

	/* Try a CACHELINE_SIZE unrolled aligned block copy with prefetch */
1:	and	v1, a2, CACHELINE_SIZE - 1	# remaining size % blocksize
	subu	a3, a2, v1			# size of remaining blocks
	beqz	a3, aligned_wordcpy		# none?
	 move	a2, v1				# bytes remaining after block copy
	addu	a3, a1				# compute ending address
	subu	t3, a3, NOFETCH_SIZE_ALIGNED	# prefetch only below t3

	bgeu	a1, t3, 2f			# too close to the end: no prefetch
	 nop

	/* Prime the two cache lines preceding the steady-state distance */
	pref	PREF_LOAD, PREF_LOAD_BIAS_ALIGNED-2*CACHELINE_SIZE(a1)
	pref	PREF_LOAD, PREF_LOAD_BIAS_ALIGNED-1*CACHELINE_SIZE(a1)
	pref	PREF_PREPAREFORSTORE, PREF_STORE_BIAS_ALIGNED-2*CACHELINE_SIZE(a0)
	pref	PREF_PREPAREFORSTORE, PREF_STORE_BIAS_ALIGNED-1*CACHELINE_SIZE(a0)

	/* 1: block copy with prefetch, 2: block copy without prefetch */
1:	pref	PREF_LOAD, PREF_LOAD_BIAS_ALIGNED(a1)
	pref	PREF_PREPAREFORSTORE, PREF_STORE_BIAS_ALIGNED(a0)
2:	REG_L	v1, SZREG*0(a1)			# copy block, both sides aligned
	REG_L	t0, SZREG*1(a1)
	REG_L	t1, SZREG*2(a1)
#ifdef __mips64
	REG_L	ta3, SZREG*3(a1)
#else
	REG_L	t2, SZREG*3(a1)
	REG_L	ta0, SZREG*4(a1)
	REG_L	ta1, SZREG*5(a1)
	REG_L	ta2, SZREG*6(a1)
	REG_L	ta3, SZREG*7(a1)
#endif
	REG_S	v1, SZREG*0(a0)
	REG_S	t0, SZREG*1(a0)
	REG_S	t1, SZREG*2(a0)
#ifndef __mips64
	REG_S	t2, SZREG*3(a0)
	REG_S	ta0, SZREG*4(a0)
	REG_S	ta1, SZREG*5(a0)
	REG_S	ta2, SZREG*6(a0)
#endif
	addu	a1, CACHELINE_SIZE
	addu	a0, CACHELINE_SIZE
	bltu	a1, t3, 1b			# still far from the end: prefetch
	 REG_S	ta3, -SZREG(a0)			# last register of the block
	bne	a1, a3, 2b			# finish blocks without prefetch
	 nop

	/* Try a word at a time */
aligned_wordcpy:
	and	v1, a2, SZREG-1		# remaining size % word size
	subu	a3, a2, v1		# size of remaining words
	beqz	a3, smallcpy		# none?
	 move	a2, v1			# bytes remaining after word copy
	addu	a3, a1			# compute ending address

1:	REG_L	v1, 0(a1)		# copy words
	addu	a1, SZREG
	addu	a0, SZREG
	bne	a1, a3, 1b
	 REG_S	v1, -SZREG(a0)

smallcpy:
	/* Last resort: byte at a time */
	beqz	a2, leave		# no tail bytes left
	 addu	a3, a2, a1		# compute ending address

1:	lbu	v1, 0(a1)		# copy bytes
	addu	a1, 1
	addu	a0, 1
	bne	a1, a3, 1b
	 sb	v1, -1(a0)

leave:	j	ra			# v0 still holds the original to
	 nop
END(memcpy)
