/*
   This file is part of a programme to research fast Fermat factorisation algorithms.

   This is free software: you can redistribute it and/or modify
   it under the terms of version 3 of the GNU General Public License as
   published by the Free Software Foundation.

   It is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public Licence
   along with this programme.  If not, see <https://www.gnu.org/licenses/>.

   A copy of the licence is also available at
   https://www.cambridgeclarion.org/download/GPLv3.txt
   It has the following cryptographic hash
   SHA256 3972dc9744f6499f0f9b2dbf76696f2ae7ad8af9b23dde66d6af86c9dfb36986

*/

/*

    The software in this file was taken from the
    the public domain software YAFU and modified.

    The modifications are copyright 2022 Stephen Hewitt,
    licensed as above.

    The header in the original YAFU file follows:
*/

/*----------------------------------------------------------------------
This source distribution is placed in the public domain by its author,
Ben Buhrow. You may use it for any purpose, free of charge,
without having to notify anyone. I disclaim any responsibility for any
errors.

Optionally, please be nice and tell me if you find this source to be
useful. Again optionally, if you add to the functionality present here
please consider making those additions public too, so that others may
benefit from your work.	

Some parts of the code (and also this header), included in this
distribution have been reused from other sources. In particular I
have benefitted greatly from the work of Jason Papadopoulos's msieve @
www.boo.net/~jasonp, Scott Contini's mpqs implementation, and Tom St.
Denis Tom's Fast Math library.  Many thanks to their kind donation of
code to the public domain.
       				   --bbuhrow@gmail.com 11/24/09
----------------------------------------------------------------------*/

#include <sys/time.h>
#include <stdlib.h>
#include "gmp.h"
#include "zf.h"

typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned long uint32;
typedef unsigned long long uint64;

#define setbit(a,b) (((a)[(b) >> 3]) |= (nmasks[(b) & 7]))
#define getbit(a,b) (((a)[(b) >> 3]) & (nmasks[(b) & 7]))

int zFermat2(Output *pout, unsigned long M, mpz_t gmp_n)
{
    // Fermat's factorization method with a sieve-based improvement
    // provided by 'neonsignal'
    mpz_t a, b2, tmp, multN, a2;
    int i;
    uint64 i64;
    uint32 M1 = 11 * 17 * 23 * 31; //133331u
    uint32 M2 = 13 * 19 * 29 * 37; //265031u
    uint8 *sqr, *sqr1, *sqr2, *mod, *mod1, *mod2;
    uint16 *skip;
    uint32 m, mmn, s, d;
    uint8 masks[8] = {0xfe, 0xfd, 0xfb, 0xf7, 0xef, 0xdf, 0xbf, 0x7f};
    uint8 nmasks[8];
    uint32 iM1 = 0, iM2 = 0;
    uint32 mult = 1; // multiplier not supported for now
    int result;
    struct timeval tv_enter;
    struct timeval tv_start_search;
    struct timeval tv_found;
    /* skip_n is the number of filter-passing patterns in the entire modulo M pattern space.
       This will be the size of skip[]
       This function puts the value into skip_n by counting.
       However in principle is already known and could be determined from M
       (since RR for every M is known from the published reduction ratio tables)
       Leaving the counting as a check for now.
    */
    unsigned skip_n;

    gettimeofday(&tv_enter, 0);

    mpz_init(a);
    mpz_init(b2);
    mpz_init(tmp);
    mpz_init(multN);
    mpz_init(a2);

    // apply the user supplied multiplier
    mpz_mul_ui(multN, gmp_n, mult);

    // compute ceil(sqrt(multN))
    mpz_sqrt(a, multN);

    // form b^2
    mpz_mul(b2, a, a);
    mpz_sub(b2, b2, multN);

    // test successive 'a' values using a sieve-based approach.
    // the idea is that not all 'a' values allow a^2 or b^2 to be square.
    // we pre-compute allowable 'a' values modulo various smooth numbers and
    // build tables to allow us to quickly iterate over 'a' values that are
    // more likely to produce squares.
    // init sieve structures
    sqr = (uint8 *)calloc((M / 8 + 1) , sizeof(uint8));
    sqr1 = (uint8 *)calloc((M1 / 8 + 1) , sizeof(uint8));
    sqr2 = (uint8 *)calloc((M2 / 8 + 1) , sizeof(uint8));
    mod = (uint8 *)calloc((M / 8 + 1) , sizeof(uint8));
    mod1 = (uint8 *)calloc((M1 / 8 + 1) , sizeof(uint8));
    mod2 = (uint8 *)calloc((M2 / 8 + 1) , sizeof(uint8));

    // test it.  This will be good enough if |u*p-v*q| < 2 * N^(1/4), where
    // mult = u*v
    if (mpz_perfect_square_p(b2))
        goto found;

    for (i=0; i<8; i++)
        nmasks[i] = ~masks[i];

    // marks locations where squares can occur mod M, M1, M2
    for (i64 = 0; i64 < M; ++i64)
        setbit(sqr, (i64*i64)%M);

    for (i64 = 0; i64 < M1; ++i64)
        setbit(sqr1, (i64*i64)%M1);

    for (i64 = 0; i64 < M2; ++i64)
        setbit(sqr2, (i64*i64)%M2);

    // for the modular sequence of b*b = a*a - n values
    // (where b2_2 = b2_1 * 2a + 1), mark locations where
    // b^2 can be a square
    m = mpz_mod_ui(tmp, a, M);
    mmn = mpz_mod_ui(tmp, b2, M);
    skip_n = 0;
    for (i = 0; i < M; ++i)
    {
        if (getbit(sqr, mmn))
        {
            setbit(mod, i);
            skip_n++;
        }
        mmn = (mmn+m+m+1)%M;
        m = (m+1)%M;
    }

    skip = (uint16 *)malloc(skip_n * sizeof(uint16));
    pout->skip_n = skip_n;

    // each position in skip[] will contain the absolute increase in big presquare to the next candidate
    d = 0;
    unsigned start_offset = 0;
    for (i = 0; !getbit(mod,i); ++i)
        ++start_offset;

    s = start_offset;
    unsigned skindex = skip_n - 1;
    unsigned max_skip = start_offset; // it must be at least this, even if this skip value does not exist
    i = M;
    while (i > 0)
    {
        --i;
        ++s;
        if (getbit(mod,i)) // ie found a candidate
        {
            skip[skindex--] = s; // because s is how far we have counted backwards from the next candidate
            if (s > max_skip)
                max_skip = s;
            s = 0;
        }
    }
    if (max_skip > 0xffff)
    {
        result = 233; // "fatal there is a skip too big for the 2-byte skip table elements"
        goto free_and_return;
    }
    pout->max_skip = max_skip;

    // for the modular sequence of b*b = a*a - n values
    // (where b2_2 = b2_1 * 2a + 1), mark locations where the
    // modular sequence can be a square mod M1.  These will
    // generally differ from the sequence mod M.
    m = mpz_mod_ui(tmp, a, M1);
    mmn = mpz_mod_ui(tmp, b2, M1);
    for (i = 0; i < M1; ++i)
    {
        if (getbit(sqr1, mmn)) setbit(mod1, i);
        mmn = (mmn+m+m+1)%M1;
        m = (m+1)%M1;
    }

    // for the modular sequence of b*b = a*a - n values
    // (where b2_2 = b2_1 * 2a + 1), mark locations where the
    // modular sequence can be a square mod M2.  These will
    // generally differ from the sequence mod M or M1.
    m = mpz_mod_ui(tmp, a, M2);
    mmn = mpz_mod_ui(tmp, b2, M2);
    for (i = 0; i < M2; ++i)
    {
        if (getbit(sqr2, mmn)) setbit(mod2, i);
        mmn = (mmn+m+m+1)%M2;
        m = (m+1)%M2;
    }

    gettimeofday(&tv_start_search, 0);

    // loop, checking for perfect squares
    mpz_mul_2exp(a2, a, 1);

    s = start_offset;
    skindex = 0;
    do
    {
        d = 0;
        do
        {
            // remember how far we skipped
            d += s;

            // update the other residue indices
            if ((iM1 += s) >= M1) iM1 -= M1;
            if ((iM2 += s) >= M2) iM2 -= M2;

            // skip to the next possible square residue of b*b mod M
            s = skip[skindex];

            skindex++;
            if (skindex == skip_n)
                skindex = 0;

            // continue if either of the other residues indicates
            // non-square.
        } while (!getbit(mod1,iM1) || !getbit(mod2,iM2));

        // form b^2 by incrementing by many factors of 2*a+1
        mpz_add_ui(tmp, a2, d);
        mpz_mul_ui(tmp, tmp, d);
        mpz_add(b2, b2, tmp);

        // accumulate so that we can reset d
        // (and thus keep it single precision)
        mpz_add_ui(a2, a2, d*2);

    } while (!mpz_perfect_square_p(b2));

found:
    gettimeofday(&tv_found, 0);

    pout->setup_secs = elapsed_secs(tv_start_search, tv_enter);
    pout->search_secs = elapsed_secs(tv_found, tv_start_search);

    if ((mpz_size(b2) > 0) && mpz_perfect_square_p(b2))
        result = check_result(b2, gmp_n);
    else
        result = 134; // "fatal b2 is not as expected"

free_and_return:
    mpz_clear(tmp);
    mpz_clear(a);
    mpz_clear(b2);
    mpz_clear(multN);
    mpz_clear(a2);
    free(sqr);
    free(sqr1);
    free(sqr2);
    free(mod);
    free(mod1);
    free(mod2);
    free(skip);

    return result;
}