Run Infomap on GraphRAG-style tables

GraphRAG workflows commonly build hierarchical communities with Leiden. This notebook runs Infomap on the same entities.parquet and relationships.parquet shape, writes GraphRAG-style community outputs, and compares the result with a small Leiden baseline when igraph is installed.

from pathlib import Path
import tempfile

import pandas as pd

from infomap.graphrag import read_graphrag, run_graphrag_communities

try:
    import igraph as ig
except ImportError:
    ig = None
# Fresh scratch directory for this run. Only the input folder is created
# here; output_dir is just a path at this point.
work_dir = Path(tempfile.mkdtemp(prefix="infomap-graphrag-"))
input_dir = work_dir.joinpath("input")
output_dir = work_dir.joinpath("infomap")
input_dir.mkdir()

# Toy input tables. Six entities, keyed by id and labelled by title.
_entity_titles = {
    "a": "Alpha",
    "b": "Beta",
    "c": "Gamma",
    "d": "Delta",
    "e": "Epsilon",
    "f": "Zeta",
}
entities = pd.DataFrame(
    {
        "id": list(_entity_titles),
        "title": list(_entity_titles.values()),
    }
)

# One record per relationship: (id, source title, target title, weight).
# Relationships refer to entities by title, not by id. The edges form two
# triangles (weights 2.0 and 3.0) joined by a single weight-1.0 edge
# between Gamma and Delta.
_edge_records = [
    ("ab", "Alpha", "Beta", 2.0),
    ("bc", "Beta", "Gamma", 2.0),
    ("ca", "Gamma", "Alpha", 2.0),
    ("de", "Delta", "Epsilon", 3.0),
    ("ef", "Epsilon", "Zeta", 3.0),
    ("fd", "Zeta", "Delta", 3.0),
    ("cd", "Gamma", "Delta", 1.0),
]
_edge_ids, _edge_sources, _edge_targets, _edge_weights = map(
    list, zip(*_edge_records)
)
relationships = pd.DataFrame(
    {
        "id": _edge_ids,
        "source": _edge_sources,
        "target": _edge_targets,
        "weight": _edge_weights,
    }
)

# Persist both tables under the file names the later cells read back.
entities_path = input_dir / "entities.parquet"
relationships_path = input_dir / "relationships.parquet"
entities.to_parquet(entities_path)
relationships.to_parquet(relationships_path)
# Load the two parquet tables written above into a graph object.
entities_file = input_dir / "entities.parquet"
relationships_file = input_dir / "relationships.parquet"
graph = read_graphrag(entities_file, relationships_file)

# Display the relationships with graph.sources / graph.targets attached as
# extra columns, so each edge's endpoints can be read next to its titles.
relationships.assign(source_node=graph.sources, target_node=graph.targets)
id source target weight source_node target_node
0 ab Alpha Beta 2.0 1 2
1 bc Beta Gamma 2.0 2 3
2 ca Gamma Alpha 2.0 3 1
3 de Delta Epsilon 3.0 4 5
4 ef Epsilon Zeta 3.0 5 6
5 fd Zeta Delta 3.0 6 4
6 cd Gamma Delta 1.0 3 4
# Options forwarded to run_graphrag_communities alongside the directories.
# NOTE(review): the fixed seed is presumably there to keep the displayed
# numbers reproducible; confirm against the infomap.graphrag docs.
infomap_options = {"silent": True, "seed": 123, "num_trials": 5}
result = run_graphrag_communities(
    input_dir=input_dir,
    output_dir=output_dir,
    **infomap_options,
)

# Last expression of the cell: show the codelength of the run.
result.infomap.codelength
1.9831517459081185
# Keep a handle on the node table of the run and display it as the cell's
# last expression.
infomap_nodes = result.nodes
infomap_nodes
node_id entity_id entity_title module_id module_path level flow
0 1 a Alpha 1 [1] 1 0.12500
1 2 b Beta 1 [1] 1 0.12500
2 3 c Gamma 1 [1] 1 0.15625
3 4 d Delta 2 [2] 1 0.21875
4 5 e Epsilon 2 [2] 1 0.18750
5 6 f Zeta 2 [2] 1 0.18750
# Keep a handle on the community table of the run and display it. The
# comparison code further down reads its "level", "size" and "entity_ids"
# columns.
infomap_communities = result.communities
infomap_communities
id human_readable_id community parent children level title entity_ids relationship_ids text_unit_ids period size
0 infomap-1 1 1 -1 [] 0 Infomap community 1 [a, b, c] [ab, bc, ca] [] None 3
1 infomap-2 2 2 -1 [] 0 Infomap community 2 [d, e, f] [de, ef, fd] [] None 3
def _top_level_groups(communities):
    """Return the sorted member list of every level-0 community.

    ``communities`` is a table with a ``level`` column and an ``entity_ids``
    column holding one collection of members per row. Rows whose level is
    not 0 are ignored; the result keeps the table's row order.
    """
    is_top_level = communities["level"] == 0
    member_lists = communities.loc[is_top_level, "entity_ids"]
    return list(map(sorted, member_lists))


# First row of the comparison: a summary of the Infomap communities.
infomap_row = {
    "method": "Infomap",
    "number of communities": len(infomap_communities),
    "levels": int(infomap_communities["level"].nunique()),
    "largest community size": int(infomap_communities["size"].max()),
    "entity groups at top level": _top_level_groups(infomap_communities),
}
comparison_rows = [infomap_row]

if ig is not None:
    # Rebuild the same weighted edges as an undirected igraph graph. The
    # vertices are named after the source/target values, i.e. entity titles.
    edge_tuples = relationships[["source", "target", "weight"]].itertuples(
        index=False, name=None
    )
    leiden_graph = ig.Graph.TupleList(
        edge_tuples,
        directed=False,
        edge_attrs=["weight"],
    )
    leiden_partition = leiden_graph.community_leiden(
        weights="weight",
        objective_function="modularity",
    )

    # NOTE(review): these groups contain titles, while the Infomap row above
    # lists entity ids, so the two "entity groups" cells are not directly
    # comparable as written.
    vertex_names = leiden_graph.vs["name"]
    leiden_groups = [
        sorted(vertex_names[vertex] for vertex in community)
        for community in leiden_partition
    ]
    comparison_rows.append(
        {
            "method": "Leiden",
            "number of communities": len(leiden_partition),
            # A single flat partition is reported as one level.
            "levels": 1,
            "largest community size": max(map(len, leiden_groups)),
            "entity groups at top level": leiden_groups,
        }
    )
else:
    print("Install python-igraph to run the Leiden comparison.")

# Last expression of the cell: render the comparison as a table.
pd.DataFrame(comparison_rows)
method number of communities levels largest community size entity groups at top level
0 Infomap 2 1 3 [[a, b, c], [d, e, f]]
1 Leiden 2 1 3 [[Alpha, Beta, Gamma], [Delta, Epsilon, Zeta]]