字符串哈希

例题来自《信息学奥赛一本通·提高篇》。

哈希函数

设字符串为 $s=c_1c_2\ldots c_m$ ，定义哈希函数 $f(s)=(c_1b^{m-1}+c_2b^{m-2}+\dots+c_mb^0)\bmod M$ 。其中 $M$ 为一大质数， $b$ 为小于 $M$ 的任意整数。

比如说这样：

const int b=114,M=1e9+9;
int h(char* str){
    int l = strlen(str);
    int res=0;
    for(int i=0;i<l;i++){
        res=((ll)res*b+str[i])%M;
    }
    return res;
}

O(1) 求子串哈希

先预处理数组 $H_i$ 代表字符串 $s$ 前 $i$ 位的哈希值。则字串 $[l,r]$ 的哈希值可以表示为 $H_r - H_{(l-1)} b^{r-l+1}$ 。如果预处理出 $b$ 的所有 $0\sim |s|$ 次幂，则可以 $O(n)$ 预处理， $O(1)$ 回答单个字串的哈希。

const int N = 1e6+5;
const int b=114514,M=1e9+9;

char pat[N];
char s[N];
int h[N];
int bp[N];

void init(){
    int l=strlen(s);
    bp[1]=b;
    for(int i=2;i<=l;i++){
        bp[i]=(ll)bp[i-1]*b%M;
    }

    h[0]=s[0];
    for(int i=1;i<l;i++){
        h[i]=((ll)h[i-1]*b+s[i])%M;
    }
}
int get_hash(int l,int r){
    if(l==0)return h[r];
    return ((h[r]-(ll)h[l-1]*bp[r-l+1])%M+M)%M; // 保证正数
}

如上代码。注意：如果你使用的是 int 存哈希，请在做乘法时全部强制转换到 long long。如果你懒得转或者你害怕你忘记转，建议直接用 long long 存哈希。

子串查找

除了用 KMP 算法，通过上面的技巧，我们也可以用 $O(n+m)$ 的复杂度求出字符串的出现次数和位置。

LOJ #103. 子串查找

#include<bits/stdc++.h>
using namespace std;

using ll=long long;

const int N = 1e6+5;
const int b=114514,M=1e9+9;

char pat[N];
char s[N];
int h[N];
int bp[N];

void init(){
    int l=strlen(s);
    bp[1]=b;
    for(int i=2;i<=l;i++){
        bp[i]=(ll)bp[i-1]*b%M;
    }

    h[0]=s[0];
    for(int i=1;i<l;i++){
        h[i]=((ll)h[i-1]*b+s[i])%M;
    }
}
int get_hash(int l,int r){
    if(l==0)return h[r];
    return ((h[r]-(ll)h[l-1]*bp[r-l+1])%M+M)%M; // 保证正数
}

int main(){
    int E9 = 1e9, E6=1e6;
    int x = (E9*E9+5)%E6;
    printf("%d\n",x);
    return 0;

    scanf("%s",s);
    scanf("%s",pat);
    
    init();

    int l_pat = strlen(pat);
    int hash_pat=0;
    for(int i=0;i<l_pat;i++){
        hash_pat = ((ll)hash_pat*b+pat[i])%M;
    }

    int l_s = strlen(s);
    int ans = 0;
    for(int i=0;i+l_pat<=l_s;i++){
        if(get_hash(i,i+l_pat-1)==hash_pat)ans++;
        // printf("[%d] %d %d\n",i,get_hash(i,i+l_pat-1),hash_pat);
    }
    printf("%d\n",ans);

    return 0;
}

例题

Power Strings

LOJ #10035. 「一本通 2.1 练习 1」Power Strings

给定若干个长度小于等于 $10^6$ 的字符串，询问每个字符串最多由多少个相同的子串重复连接而成。
例：ababab 最多由 3 个 ab 拼接而成。

哈希解法

枚举字符串长度所有的约数作为循环节长度，然后再判断即可。对于循环节的判定，事实上：

$\text{长度为 } n \text{ 的字符串 } s \text{ 存在长度为 } i \text{ 的循环节} \iff s[0..n-1-i]=s[i..n-1]$

注意这是一个充要条件。证明不难，略去。（可看下图理解）

对于枚举约数，有一种复杂度为 $O(\sqrt{n})$ 的做法。枚举 $1\sim \sqrt{n}$ 中的数，若 $x$ 是 $n$ 约数，则 $\dfrac{n}{x}$ 也是 $n$ 的约数。这样可以不重不漏枚举出所有的约数。

复杂度 $O(\sqrt n)$ 。

#include<bits/stdc++.h>
using namespace std;

using ll=long long;
const int N = 1e6+5;
char s[N];
bool ok;
const ll b=114514,M=1e9+7;
ll power_b[N],h[N];

ll get_hash(int l,int r){
    if(l==0)return h[r];
    return ((h[r]-h[l-1]*power_b[r-l+1])%M+M)%M;
}

int main(){
    power_b[1]=b;
    for(int i=2;i<N;i++) power_b[i]=power_b[i-1]*b%M;

    while(1){
        scanf("%s",s);if(s[0]=='.')return 0;

        int l = strlen(s);

        h[0]=s[0];
        for(int i=1;i<l;i++)h[i]=(h[i-1]*b+s[i])%M;
        
        int ans = 1;
        for(int i=1;i<l;i++){ // i: 循环节长度
            if(l%i!=0)continue; 
            if(get_hash(0,l-i-1) == get_hash(i,l-1))ans=max(ans,l/i); // 有 l/i 次
        }
        printf("%d\n",ans);
    }
    return 0;
}

KMP 解法

本题还有一种做法是利用 KMP 算法的 next 数组，可以做到复杂度 $O(n)$ 。

具体来说，如果答案不是 1，那么 next[l-1] 一定存的是 答案-1 遍的重复子串的长度。设 x=l-next[l-1]，若 x 是 l 的因数，那么 l 一定可以由 l/x 个相同子串重复连接成，否则答案就是 1。

#include<bits/stdc++.h>
using namespace std;

using ll=long long;


const int N = 1e6+5;
char s[N];
int nxt[N];


int main(){
    while(1){
        scanf("%s",s);if(s[0]=='.')return 0;

        int l = strlen(s);
        int pre=0,i=1;
        nxt[0]=0;
        while(i<l){
            if(s[pre]==s[i]){
                pre++;
                nxt[i++]=pre;
            }
            else{
                if(pre==0)nxt[i++]=0;
                else pre=nxt[pre-1];
            }
        }

        int x=l-nxt[l-1];
        if(l%x==0)printf("%d\n",l/x);
        else puts("1");
    }
    return 0;
}

Seek the Name, Seek the Fame

LOJ #10036. 「一本通 2.1 练习 2」Seek the Name, Seek the Fame

给定若干字符串（这些字符串总长 $\le 4\times 10^5$ ），在每个字符串中求出所有既是前缀又是后缀的子串长度。

哈希解法

$O(n)$ 枚举每个前后缀的哈希判断即可。

#include<bits/stdc++.h>
using namespace std;

using ll=long long;

const int N = 4e5+5;

char s[N];

ll h[N],bp[N],b=114514,M=1e9+7;
ll get_hash(int l,int r){
    if(l==0)return h[r];
    return ((h[r]-h[l-1]*bp[r-l+1])%M+M)%M;
}

int main(){
    bp[1]=b;
    for(int i=2;i<N;i++)bp[i]=bp[i-1]*b%M;

    while(scanf("%s",s)!=EOF){
        h[0]=s[0];
        int l = strlen(s);
        for(int i=1;i<l;i++){
            h[i]=(h[i-1]*b+s[i])%M;
        }

        for(int i=1;i<l;i++){
            if(get_hash(0,i-1)==get_hash(l-i,l-1))
                printf("%d ",i);            
        }
        printf("%d\n",l);
    }
    return 0;
}

KMP 解法

这道题也可以用 KMP 算法求解，一样是用 next 数组。和线性递推求 next 数组的逻辑完全一样，从 next[l-1] 一步一步往回跳，最后倒序输出即可。

#include<bits/stdc++.h>
using namespace std;

using ll=long long;

const int N = 4e5+5;

char s[N];
int nxt[N];

int main(){
    while(scanf("%s",s)!=EOF){
        int l = strlen(s);

        nxt[0]=0;
        int pre=0,i=1;
        while(i<l){
            if(s[pre]==s[i]){
                pre++;
                nxt[i++]=pre;
            }else{
                if(pre!=0)pre=nxt[pre-1];
                else nxt[i++]=0;
            }
        }

        stack<int> ans;
        ans.push(l);
        int x = nxt[l-1];
        while(x!=0){ // 和22行完全等价
            ans.push(x);x=nxt[x-1];
        }
        while(ans.size()){
            printf("%d ",ans.top());ans.pop();
        }
        puts("");
    }
    return 0;
}

friends

LOJ #2823. 「BalticOI 2014 Day 1」三个朋友

给定一个字符串 $S$ ，先将字符串 $S$ 复制一次，得到字符串 $T$ ，然后在 $T$ 中插入一个字符，得到字符串 $U$ 。
给出字符串 $U$ ，重新构造出字符串 $S$ 。

如果字符串无法按照上述方法构造出来，输出 NOT POSSIBLE；
如果字符串 $S$ 不唯一，输出 NOT UNIQUE。

解法

无非就是要解决中间删去了一个字符的哈希问题。我们考虑在我们 $O(1)$ 求子串哈希的算法进行一些小修改。

设删去的位置为 $x$ 。若 $[l,r]$ 不包含 $x$ ，则答案不变。若 $[l,r]$ 包含 $x$ ，我们考虑拆成两个区间 $[l,x-1]$ 和 $[x+1,r]$ 求解。（具体见代码）

复杂度 $O(n)$ 。

#include<bits/stdc++.h>
using namespace std;

using ll=long long;

const int N = 2e6+5;

char s[N];
ll h[N],bp[N];
const ll b=114514,M=1e9+9;

int x; // x 是被删去的位置
ll get_hash(int l,int r){
    if(l>r)return 0; // 防止 l=x,r=x 越界
    if(l<=x&&x<=r)// hash[l,x-1]*b^(r-x)  + hash[x+1,r] 
        return (get_hash(l,x-1)*bp[r-x]%M+get_hash(x+1,r))%M;
    if(l==0)return h[r];
    return ((h[r]-h[l-1]*bp[r-l+1])%M+M)%M;
}

int main(){
    bp[0]=1;for(int i=1;i<N;i++)bp[i]=bp[i-1]*b%M;
    int n;
    scanf("%d%s",&n,s);

    if(n%2==0){ // 必须是奇数
        puts("NOT POSSIBLE");
        return 0;
    }
    
    h[0]=s[0];
    for(int i=1;i<n;i++)h[i]=(h[i-1]*b+s[i])%M;
    
    bool ok = 0;
    ll ans_hash = -1; // 统计是否有重复答案
    int ans_x;

    for(x=0;x<n;x++){ // 枚举删除的位置
        int mid = n/2;
        if(x>mid)mid--;

        if(get_hash(0,mid)==get_hash(mid+1,n-1)){
            if(ans_hash==-1){
                ans_hash=get_hash(0,mid),ans_x=x;
                ok=1;
            }
            else if(ans_hash!=get_hash(0,mid)){ // 不止一个答案
                puts("NOT UNIQUE");
                return 0;
            }
        }
    }

    if(ok){
        int mid = n/2;
        if(ans_x>mid)mid--;
        for(int i=0;i<=mid;i++){
            if(i==ans_x)continue;
            putchar(s[i]);
        }
    }else{
        puts("NOT POSSIBLE");
    }
    return 0;
}

A Horrible Poem

给出一个由小写英文字母组成的字符串 $S$ ，再给出 $q$ 个询问，要求回答 $S$ 某个子串的最短循环节。
$1 \le a \le b \le n \le {5\times 10^5}$ ， $q \le {2\times 10 ^ 6}$ 。

解法

如果用例 1 一样的方法求解，复杂度 $O(q\sqrt n)$ ，不能接受。这个问题可以优化到 $O(q \log n)$ 。

事实上，我们只需要找出区间长度 $r-l+1$ 的所有质因数。初始答案为 $ans=r-l+1$ ，设质因数为 $p$ ，如果这个 $\dfrac{ans}{p}$ 是一个循环节的长度，答案除掉这个质因数，最后得到的就是最终答案。

解释一下：对于目前的循环节长度 ans，如果能找到一个整数，使 ans/x 仍是一个循环节的长度，那么 ans/x 就是一个更小的循环节。我们目标就是对 ans 不断地除，直到除到没有数可以除。我们只要对 $r-l+1$ 的所有质因数都试一遍就可以了。

这份代码略卡了一点常。

#include<bits/stdc++.h>
using namespace std;

using ll=long long;
int read(){
    int f=1,x=0;char c=getchar();
    while(!isdigit(c)){if(c=='-')f=-1;c=getchar();}
    while(isdigit(c)){x=x*10+c-'0';c=getchar();}
    return x*f;
}
template <class T>
void write(T x){
    if(x<0)putchar('-'),x=-x;
    if(x>9)write(x/10);
    putchar(x%10+'0');
}

const int N = 1e6+5;
char s[N];
const int b=123,M=1e9+7;
int power_b[N],h[N];

int get_hash(int l,int r){
    if(l==0)return h[r];
    return ((h[r]-(ll)h[l-1]*power_b[r-l+1])%M+M)%M;
}

int p[N],v[N],m;
void sieve(){
    for(int i=2;i<N;i++){
        if(v[i]==0){
            p[m++]=i;
            v[i]=i;
        }
        for(int j=0;j<m;j++){
            if(p[j]>v[i] || p[j]*i>=N)break;
            v[p[j]*i]=p[j];
        }
    }
}

int main(){
    sieve();

    power_b[1]=b;
    for(int i=2;i<N;i++) power_b[i]=(ll)power_b[i-1]*b%M;
    int n,q;
    scanf("%d%s%d",&n,s,&q);

    h[0]=s[0];
    for(int i=1;i<n;i++)h[i]=((ll)h[i-1]*b+s[i])%M;
    
    while(q--){
        int l=read()-1,r=read()-1;
        int ans = r-l+1;

        int x=r-l+1;
        while(x!=1){
            int k = (ans)/v[x];
            if(get_hash(l,r-k) == get_hash(l+k,r))ans/=v[x];
            x/=v[x];
        }
        write(ans);putchar('\n');
    }

    return 0;
}

Beads

LOJ #2427. 「POI2010」珍珠项链 Beads

Byteasar 决定制造一条项链，她买了一串珠子，她有一个机器，能把这条珠子切成很多段，使得每段恰有 $k$ 个珠子 ( $k>0$ ) ，如果这条珠子的长度不是 $k$ 的倍数，最后一块长度小于 $k$ 的段就被丢弃了。
Byteasar 想知道，选择什么数字 $k$ 可以得到最多的不同的段。注意这里的段是可以反转的，即，子串 $1,2,3$ 和 $3,2,1$ 被认为是一样的。

解法

尝试暴力枚举每个 $k$ 。总复杂度为 $\displaystyle O\left(\sum_{i=1}^{n}\bigg\lfloor\dfrac{n}{i}\bigg\rfloor\right)= O\left(\sum_{i=1}^{n}\dfrac{n}{i}\right) = O\left(n\sum_{i=1}^{n}\dfrac{1}{i}\right)$ 。后面为调和级数，可以近似为 $O(\log n)$ 。所以总复杂度为 $O(n \log n)$ ，是足够的。

接下来就是解决反转串的问题。为此，我们可以再增加一个后缀的哈希，同理计算即可。为了判重，可以用哈希表（这里使用 unordered_map），并且我们不需要清除这个哈希表，而是用到一个使时间标记的技巧。

#include<bits/stdc++.h>
using namespace std;

using ll=long long;
using ull=unsigned long long;
using pii=pair<int,int>;
#define begend(x) x.begin(),x.end()
#define mem0(x) memset(x,0,sizeof(x))
#define YES puts("YES")
#define NO puts("NO")

int read(){
    int f=1,x=0;char c=getchar();
    while(!isdigit(c)){if(c=='-')f=-1;c=getchar();}
    while(isdigit(c)){x=x*10+c-'0';c=getchar();}
    return x*f;
}

const int N = 2e5+5;
int n,a[N];
const ll b=19260817,M=1e9+9;
ll h_pre[N],h_suf[N],bp[N];

int ans=0;
int ans_cnt=0,ans_k[N];

unordered_map<ll,int> vis;

ll get_pre(int l,int r){
    return ((h_pre[r]-h_pre[l-1]*bp[r-l+1])%M+M)%M;
}
ll get_suf(int l,int r){
    return ((h_suf[l]-h_suf[r+1]*bp[r-l+1])%M+M)%M;
}

int main(){
    bp[0]=1;for(int i=1;i<N;i++)bp[i]=bp[i-1]*b%M;

    int n=read();
    for(int i=1;i<=n;i++)a[i]=read();

    for(int i=1;i<=n;i++)h_pre[i]=(h_pre[i-1]*b+a[i])%M; // 前缀哈希
    for(int i=n;i>=1;i--)h_suf[i]=(h_suf[i+1]*b+a[i])%M; // 后缀哈希

    for(int k=1;k<=n;k++){
        if(n/k < ans)break; // 小优化

        int cnt = 0; // 有多少个不同的子串
        for(int i=1;i*k<=n;i++){
            int hash_pre = get_pre((i-1)*k+1,i*k);
            if(vis[hash_pre]==k)continue; // k 时间标记

            cnt++;
            int hash_suf = get_suf((i-1)*k+1,i*k);
            vis[hash_pre]=k; // k 时间标记
            vis[hash_suf]=k;
        }
        
        if(cnt>ans){ // 比当前答案大，重置 ans_cnt
            ans=cnt;
            ans_cnt=0;
        }
        if(cnt==ans){
            ans_k[ans_cnt++]=k;
        }
    }

    printf("%d %d\n",ans,ans_cnt);
    for(int i=0;i<ans_cnt;i++){
        printf("%d ",ans_k[i]);
    }
    return 0;
}

Antisymmetry

LOJ #2452. 「POI2010」反对称 Antisymmetry

对于一个 $0/1$ 字符串，如果将这个字符串 $0$ 和 $1$ 取反后，再将整个串反过来和原串一样，就称作「反对称」字符串。比如 $00001111$ 和 $010101$ 就是反对称的，而 $1001$ 就不是。
现在给出一个长度为 $n$ 的 $0/1$ 字符串，求它有多少个子串是反对称的，注意这里相同的子串出现在不同的位置会被重复计算， $1\le n\le 500\ 000$ 。

解法

$O(1)$ 判断任意子串是否是反对称是很简单的（用两个哈希，分别计算正串，和反+取反串的哈希）。因此，很容易想到一个 $O(n^2)$ 的算法，然而这是不够的。

观察到几个性质：

反对称子串的长度是偶数；
若 $s[l..r]$ 是反对称子串，则 $s[l+1..r-1]$ 是反对称子串。

因此，可以枚举对称轴，然后二分答案，找到最长可以延伸的范围。复杂度 $O(n\log n)$ 。

#include<bits/stdc++.h>
using namespace std;
using ll=long long;

const int N = 5e5+5;
char s[N];

int n;
const ll b=19260817,M=1e9+9;
ll h1[N],h2[N],bp[N];

bool check(int m,int len){ // 检查 [m-len+1,m+len]
    if(len==0)return 1; // 左边界
    ll hash1 = ((h1[m+len]-h1[m-len]*bp[len*2])%M+M)%M;
    ll hash2 = ((h2[m-len+1]-h2[m+len+1]*bp[len*2])%M+M)%M;
    return hash1==hash2;
}

int main(){
    int n;
    scanf("%d%s",&n,s+1);

    bp[0]=1;for(int i=1;i<N;i++)bp[i]=bp[i-1]*b%M;

    for(int i=1;i<=n;i++)h1[i]=(h1[i-1]*b+(s[i]-'0'))%M;
    for(int i=n;i>=1;i--)h2[i]=(h2[i+1]*b+((s[i]-'0')^1))%M; // 这里字符串反转了 (0->1,1->0)

    ll ans = 0;
    for(int k=1;k<=n;k++){
        int l=0,r=min(k,n-k);
        while(l<r){
            int mid = (l+r+1)>>1;
            if(check(k,mid))l=mid;
            else r=mid-1;
        }
        ans += l; // l 是以 k 为中心的反对称个数
    }
    printf("%lld\n",ans);
    return 0;
}